--- /dev/null
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme
+;
+[subrepo]
+ remote = https://github.com/bavison/arm-mem
+ branch = master
+ commit = ee8ac1d56adb7ceef4d39a5cc21a502e41982685
+ parent = 6fb01036deffd69da0af72ad1f5cf2b5fedd04d2
+ method = merge
+ cmdver = 0.4.9
--- /dev/null
+OBJS-V6L = memcmp-v6l.o memcpymove-v6l.o memset-v6l.o
+OBJS-V7L = memcmp-v7l.o memcpymove-v7l.o memset-v7l.o strlen-v7l.o
+CFLAGS += -std=gnu99 -O2 -fno-inline
+
+all: libarmmem-v6l.so libarmmem-v6l.a libarmmem-v7l.so libarmmem-v7l.a test test-strlen
+
+%.o: %.c
+ $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $^
+
+%.o: %.S
+ $(CROSS_COMPILE)gcc -c -o $@ $^
+
+libarmmem-v6l.so: $(OBJS-V6L)
+ $(CROSS_COMPILE)gcc -shared -o $@ -Wl,-soname,$@ $^
+
+libarmmem-v6l.a: $(OBJS-V6L)
+ $(CROSS_COMPILE)ar rcs $@ $^
+
+libarmmem-v7l.so: $(OBJS-V7L)
+ $(CROSS_COMPILE)gcc -shared -o $@ -Wl,-soname,$@ $^
+
+libarmmem-v7l.a: $(OBJS-V7L)
+ $(CROSS_COMPILE)ar rcs $@ $^
+
+test: test.o
+ $(CROSS_COMPILE)gcc -o $@ $^
+
+test-strlen: test-strlen.o
+ $(CROSS_COMPILE)gcc -o $@ $^
+
+clean:
+ rm -rf *.o *.so *.a test test-strlen
--- /dev/null
+arm-mem
+=======
+
+ARM-accelerated versions of selected functions from string.h
+
+To build the library, use
+$ make
+or, if cross-compiling, use
+$ CROSS_COMPILE=arm-linux-gnueabihf- make
+
+Also included are simple test harnesses, inspired by the benchmarker
+from the pixman library. These can be built via the "test" and
+"test-strlen" make targets.
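+
+As a usage sketch (the program name ./myprog is illustrative only, not
+part of this repository), the resulting shared library can stand in for
+the C library routines at run time via the dynamic linker's preload
+mechanism:
+$ LD_PRELOAD=$PWD/libarmmem-v7l.so ./myprog
+or system-wide by listing the library in /etc/ld.so.preload.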
--- /dev/null
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+.macro myfunc fname
+ .func fname
+ .global fname
+ .type fname STT_FUNC
+fname:
+.endm
+
+.macro preload_leading_step1 backwards, ptr, base, log2cl
+/* If the destination is already write-block aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if backwards
+ sub ptr, base, #1
+ bic ptr, ptr, #(1<<log2cl)-1
+ .else
+ bic ptr, base, #(1<<log2cl)-1
+ .endif
+ .set OFFSET, 0
+ .rept prefetch_distance+1
+ pld [ptr, #OFFSET]
+ .if backwards
+ .set OFFSET, OFFSET-(1<<log2cl)
+ .else
+ .set OFFSET, OFFSET+(1<<log2cl)
+ .endif
+ .endr
+.endm
+
+.macro preload_leading_step2 backwards, ptr, base, log2cl, leading_bytes, tmp
+/* However, if the destination is not write-block aligned, we may need to
+ * preload one more cache line than that. The question we need to ask is:
+ * are the leading bytes more than the amount by which the source
+ * pointer will be rounded down for preloading, and if so, by how many
+ * cache lines?
+ */
+ .if backwards
+/* Here we compare against how many bytes we are into the
+ * cache line, counting down from the highest such address.
+ * Effectively, we want to calculate
+ * leading_bytes = dst&(writeblock-1)
+ * cacheline_offset = (cacheline-1)-((src-leading_bytes-1)&(cacheline-1))
+ * extra_needed = leading_bytes - cacheline_offset
+ * and test if extra_needed is <= 0, or rearranging:
+ * leading_bytes + (src-leading_bytes-1)&(cacheline-1) <= (cacheline-1)
+ */
+ mov tmp, base, lsl #32-log2cl
+ sbc tmp, tmp, leading_bytes, lsl #32-log2cl @ requires C clear (borrow set) on entry
+ adds tmp, tmp, leading_bytes, lsl #32-log2cl
+ bcc 61f
+ pld [ptr, #-(1<<log2cl)*(prefetch_distance+1)]
+ .else
+/* Effectively, we want to calculate
+ * leading_bytes = (-dst)&(writeblock-1)
+ * cacheline_offset = (src+leading_bytes)&(cacheline-1)
+ * extra_needed = leading_bytes - cacheline_offset
+ * and test if extra_needed is <= 0.
+ */
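+/* Worked example (illustrative numbers only, assuming a 32-byte cache line
+ * and a 16-byte write block): dst = 0x1008 and src = 0x2014 give
+ * leading_bytes = (-0x1008)&15 = 8 and cacheline_offset = (0x2014+8)&31 = 28,
+ * so extra_needed = 8-28 <= 0 and the bls below skips the extra preload.
+ */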
+ mov tmp, base, lsl #32-log2cl
+ add tmp, tmp, leading_bytes, lsl #32-log2cl
+ rsbs tmp, tmp, leading_bytes, lsl #32-log2cl
+ bls 61f
+ pld [ptr, #(1<<log2cl)*(prefetch_distance+1)]
+ .endif
+61:
+.endm
+
+.macro preload_trailing backwards, base, log2cl, remain, tmp
+ /* We need either 0, 1 or 2 extra preloads */
+ .if backwards
+ rsb tmp, base, #0
+ mov tmp, tmp, lsl #32-log2cl
+ .else
+ mov tmp, base, lsl #32-log2cl
+ .endif
+ adds tmp, tmp, remain, lsl #32-log2cl
+ adceqs tmp, tmp, #0
+ /* The instruction above has two effects: it ensures Z is only
+ * set if C was clear (so Z indicates that both shifted quantities
+ * were 0), and it clears C if Z was set (so C indicates that the sum
+ * of the shifted quantities was strictly greater than one cache line) */
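+ /* e.g. (illustrative, for a 32-byte cache line): base&31 == 8 with
+ * remain == 24 gives a scaled sum of exactly one cache line, so the
+ * adceqs clears both Z and C and only the single extra preload at the
+ * 81 label below is issued */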
+ beq 82f
+ .if backwards
+ sub tmp, base, #1
+ bic tmp, tmp, #(1<<log2cl)-1
+ .else
+ bic tmp, base, #(1<<log2cl)-1
+ .endif
+ bcc 81f
+ .if backwards
+ pld [tmp, #-(1<<log2cl)*(prefetch_distance+1)]
+81:
+ pld [tmp, #-(1<<log2cl)*prefetch_distance]
+ .else
+ pld [tmp, #(1<<log2cl)*(prefetch_distance+2)]
+81:
+ pld [tmp, #(1<<log2cl)*(prefetch_distance+1)]
+ .endif
+82:
+.endm
+
+.macro preload_all backwards, narrow_case, shift, base, log2cl, remain, tmp0, tmp1
+ .if backwards
+ sub tmp0, base, #1
+ bic tmp0, tmp0, #(1<<log2cl)-1
+ pld [tmp0]
+ sub tmp1, base, remain, lsl #shift
+ .else
+ bic tmp0, base, #(1<<log2cl)-1
+ pld [tmp0]
+ add tmp1, base, remain, lsl #shift
+ sub tmp1, tmp1, #1
+ .endif
+ bic tmp1, tmp1, #(1<<log2cl)-1
+ cmp tmp1, tmp0
+ beq 92f
+ .if narrow_case
+ /* In this case, all the data fits in either 1 or 2 cache lines */
+ pld [tmp1]
+ .else
+91:
+ .if backwards
+ sub tmp0, tmp0, #1<<log2cl
+ .else
+ add tmp0, tmp0, #1<<log2cl
+ .endif
+ cmp tmp0, tmp1
+ pld [tmp0]
+ bne 91b
+ .endif
+92:
+.endm
--- /dev/null
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+.macro memcmp_process_head unaligned
+ .if unaligned
+ ldr DAT0, [S_1], #4
+ ldr DAT1, [S_1], #4
+ ldr DAT2, [S_1], #4
+ ldr DAT3, [S_1], #4
+ .else
+ ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
+ .endif
+ ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
+.endm
+
+.macro memcmp_process_tail
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ cmpeq DAT2, DAT6
+ cmpeq DAT3, DAT7
+ bne 200f
+.endm
+
+.macro memcmp_leading_31bytes
+ movs DAT0, OFF, lsl #31
+ ldrmib DAT0, [S_1], #1
+ ldrcsh DAT1, [S_1], #2
+ ldrmib DAT4, [S_2], #1
+ ldrcsh DAT5, [S_2], #2
+ movpl DAT0, #0
+ movcc DAT1, #0
+ movpl DAT4, #0
+ movcc DAT5, #0
+ submi N, N, #1
+ subcs N, N, #2
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ bne 200f
+ movs DAT0, OFF, lsl #29
+ ldrmi DAT0, [S_1], #4
+ ldrcs DAT1, [S_1], #4
+ ldrcs DAT2, [S_1], #4
+ ldrmi DAT4, [S_2], #4
+ ldmcsia S_2!, {DAT5, DAT6}
+ movpl DAT0, #0
+ movcc DAT1, #0
+ movcc DAT2, #0
+ movpl DAT4, #0
+ movcc DAT5, #0
+ movcc DAT6, #0
+ submi N, N, #4
+ subcs N, N, #8
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ cmpeq DAT2, DAT6
+ bne 200f
+ tst OFF, #16
+ beq 105f
+ memcmp_process_head 1
+ sub N, N, #16
+ memcmp_process_tail
+105:
+.endm
+
+.macro memcmp_trailing_15bytes unaligned
+ movs N, N, lsl #29
+ .if unaligned
+ ldrcs DAT0, [S_1], #4
+ ldrcs DAT1, [S_1], #4
+ .else
+ ldmcsia S_1!, {DAT0, DAT1}
+ .endif
+ ldrmi DAT2, [S_1], #4
+ ldmcsia S_2!, {DAT4, DAT5}
+ ldrmi DAT6, [S_2], #4
+ movcc DAT0, #0
+ movcc DAT1, #0
+ movpl DAT2, #0
+ movcc DAT4, #0
+ movcc DAT5, #0
+ movpl DAT6, #0
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ cmpeq DAT2, DAT6
+ bne 200f
+ movs N, N, lsl #2
+ ldrcsh DAT0, [S_1], #2
+ ldrmib DAT1, [S_1]
+ ldrcsh DAT4, [S_2], #2
+ ldrmib DAT5, [S_2]
+ movcc DAT0, #0
+ movpl DAT1, #0
+ movcc DAT4, #0
+ movpl DAT5, #0
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ bne 200f
+.endm
+
+.macro memcmp_long_inner_loop unaligned
+110:
+ memcmp_process_head unaligned
+ pld [S_2, #prefetch_distance*32 + 16]
+ memcmp_process_tail
+ memcmp_process_head unaligned
+ pld [S_1, OFF]
+ memcmp_process_tail
+ subs N, N, #32
+ bhs 110b
+ /* Just before the final (prefetch_distance+1) 32-byte blocks,
+ * deal with final preloads */
+ preload_trailing 0, S_1, 5, N, DAT0
+ preload_trailing 0, S_2, 5, N, DAT0
+ add N, N, #(prefetch_distance+2)*32 - 16
+120:
+ memcmp_process_head unaligned
+ memcmp_process_tail
+ subs N, N, #16
+ bhs 120b
+ /* Trailing words and bytes */
+ tst N, #15
+ beq 199f
+ memcmp_trailing_15bytes unaligned
+199: /* Reached end without detecting a difference */
+ mov a1, #0
+ pop {DAT1-DAT6, pc}
+.endm
+
+.macro memcmp_short_inner_loop unaligned
+ subs N, N, #16 /* simplifies inner loop termination */
+ blo 122f
+120:
+ memcmp_process_head unaligned
+ memcmp_process_tail
+ subs N, N, #16
+ bhs 120b
+122: /* Trailing words and bytes */
+ tst N, #15
+ beq 199f
+ memcmp_trailing_15bytes unaligned
+199: /* Reached end without detecting a difference */
+ mov a1, #0
+ pop {DAT1-DAT6, pc}
+.endm
+
+/*
+ * int memcmp(const void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to buffer 1
+ * a2 = pointer to buffer 2
+ * a3 = number of bytes to compare (as unsigned chars)
+ * On exit:
+ * a1 = >0/=0/<0 if s1 >/=/< s2
+ */
+
+.set prefetch_distance, 2
+
+myfunc memcmp
+ S_1 .req a1
+ S_2 .req a2
+ N .req a3
+ DAT0 .req a4
+ DAT1 .req v1
+ DAT2 .req v2
+ DAT3 .req v3
+ DAT4 .req v4
+ DAT5 .req v5
+ DAT6 .req v6
+ DAT7 .req ip
+ OFF .req lr
+
+ push {DAT1-DAT6, lr}
+
+ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
+ cmp N, #(prefetch_distance+3)*32 - 1
+ blo 170f
+
+ /* Long case */
+ /* Adjust N so that the decrement instruction can also test for
+ * inner loop termination. We want it to stop when there are
+ * (prefetch_distance+1) complete blocks to go. */
+ sub N, N, #(prefetch_distance+2)*32
+ preload_leading_step1 0, DAT0, S_1, 5
+ preload_leading_step1 0, DAT1, S_2, 5
+ tst S_2, #31
+ beq 154f
+ rsb OFF, S_2, #0 /* no need to AND with 15 here */
+ preload_leading_step2 0, DAT0, S_1, 5, OFF, DAT2
+ preload_leading_step2 0, DAT1, S_2, 5, OFF, DAT2
+ memcmp_leading_31bytes
+154: /* Second source now cacheline (32-byte) aligned; we have at
+ * least one prefetch to go. */
+ /* Prefetch offset is best selected such that it lies in the
+ * first 8 of each 32 bytes - but it's just as easy to aim for
+ * the first one */
+ and OFF, S_1, #31
+ rsb OFF, OFF, #32*prefetch_distance
+ tst S_1, #3
+ bne 140f
+ memcmp_long_inner_loop 0
+140: memcmp_long_inner_loop 1
+
+170: /* Short case */
+ teq N, #0
+ beq 199f
+ preload_all 0, 0, 0, S_1, 5, N, DAT0, DAT1
+ preload_all 0, 0, 0, S_2, 5, N, DAT0, DAT1
+ tst S_2, #3
+ beq 174f
+172: subs N, N, #1
+ blo 199f
+ ldrb DAT0, [S_1], #1
+ ldrb DAT4, [S_2], #1
+ cmp DAT0, DAT4
+ bne 200f
+ tst S_2, #3
+ bne 172b
+174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
+ tst S_1, #3
+ bne 140f
+ memcmp_short_inner_loop 0
+140: memcmp_short_inner_loop 1
+
+200: /* Difference found: determine sign. */
+ rev DAT0, DAT0
+ rev DAT4, DAT4
+ rev DAT1, DAT1
+ rev DAT5, DAT5
+ rev DAT2, DAT2
+ rev DAT6, DAT6
+ rev DAT3, DAT3
+ rev DAT7, DAT7
+
+ cmp DAT0, DAT4
+ cmpeq DAT1, DAT5
+ cmpeq DAT2, DAT6
+ cmpeq DAT3, DAT7
+
+ movhi a1, #1
+ movlo a1, #-1
+ pop {DAT1-DAT6, pc}
+ .size memcmp,.-memcmp
+
+ .unreq S_1
+ .unreq S_2
+ .unreq N
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+ .unreq OFF
+.endfunc
--- /dev/null
+/*
+Copyright (c) 2019, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/* Load 32 bytes from both buffers (8-byte aligned) post-incrementing the pointers
+ * r0q-r1q are unused, but retained so we have identical parameters to load_32b_x2_unaligned
+ * r0d-r3d are filled with data from S_1
+ * r4d-r7d are filled with data from S_2
+ * switch_loads indicates that we should re-order the loads to assist with scheduling a following pld
+ * I1-I8 are optional instructions to insert into stalls
+ */
+.macro load_32b_x2_aligned r0q, r1q, r0d, r1d, r2d, r3d, r4d, r5d, r6d, r7d, switch_loads, I1, I2, I3, I4, I5, I6, I7, I8
+ .if switch_loads == 1
+ vld1.32 {\r4d}, [S_2 :64]!
+ \I1
+ vld1.32 {\r0d}, [S_1 :64]!
+ \I2
+ vld1.32 {\r5d}, [S_2 :64]!
+ \I3
+ vld1.32 {\r1d}, [S_1 :64]!
+ \I4
+ vld1.32 {\r6d}, [S_2 :64]!
+ \I5
+ vld1.32 {\r2d}, [S_1 :64]!
+ \I6
+ vld1.32 {\r7d}, [S_2 :64]!
+ \I7
+ vld1.32 {\r3d}, [S_1 :64]!
+ \I8
+ .else
+ vld1.32 {\r0d}, [S_1 :64]!
+ \I1
+ vld1.32 {\r4d}, [S_2 :64]!
+ \I2
+ vld1.32 {\r1d}, [S_1 :64]!
+ \I3
+ vld1.32 {\r5d}, [S_2 :64]!
+ \I4
+ vld1.32 {\r2d}, [S_1 :64]!
+ \I5
+ vld1.32 {\r6d}, [S_2 :64]!
+ \I6
+ vld1.32 {\r3d}, [S_1 :64]!
+ \I7
+ vld1.32 {\r7d}, [S_2 :64]!
+ \I8
+ .endif
+.endm
+
+/* Load 32 bytes from both buffers (S_1 rounded up to 8-byte boundary, S_2 8-byte aligned), post-incrementing the pointers
+ * S_1A, S_2A are 8 bytes on from S_1, S_2
+ * SIXTEEN is constant #16
+ * r0q-r1q are Q-reg names for r0d-r3d
+ * r0d-r3d are filled with data from S_1
+ * r4d-r7d are filled with data from S_2
+ * switch_loads is ignored in this case
+ * I1-I8 are optional instructions to insert into stalls
+ * d2-d6 are used as temporaries
+ * d7 on entry and exit holds the content of the aligned 8-byte block containing the "true" value of S_1
+ * d8.u8[0] = - ((("true" S_1) & 7) * 8)
+ * d9.u8[0] = 64 + d8.u8[0]
+ */
+.macro load_32b_x2_unaligned r0q, r1q, r0d, r1d, r2d, r3d, r4d, r5d, r6d, r7d, switch_loads, I1, I2, I3, I4, I5, I6, I7, I8
+ vld1.32 {d4}, [S_1 :64], SIXTEEN
+ \I1
+ vld1.32 {d5}, [S_1A :64], SIXTEEN
+ vshl.u64 \r0d, d7, d8
+ vld1.32 {d6}, [S_1 :64], SIXTEEN
+ \I2
+ vld1.32 {d7}, [S_1A :64], SIXTEEN
+ vshl.u64 d2, d4, d9
+ vld1.32 {\r4d}, [S_2 :64], SIXTEEN
+ vshl.u64 \r1d, d4, d8
+ vld1.32 {\r5d}, [S_2A :64], SIXTEEN
+ vshl.u64 d3, d5, d9
+ vld1.32 {\r6d}, [S_2 :64], SIXTEEN
+ vshl.u64 \r2d, d5, d8
+ vld1.32 {\r7d}, [S_2A :64], SIXTEEN
+ vshl.u64 d4, d6, d9
+ vshl.u64 \r3d, d6, d8
+ vshl.u64 d5, d7, d9
+ vorr \r0q, q1
+ \I8
+ \I3
+ \I4
+ \I5
+ \I6
+ \I7
+ vorr \r1q, q2
+.endm
+
+.macro process_32b_blocks load_macro
+ // Process these as an odd number of 32-byte full blocks,
+ // then a partial block of up to 63 trailing bytes
+ cmp N, #32
+ sub N, #64
+ bmi 20f
+ \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 0
+ veor.u8 q0, q8, q10
+ subs N, #32
+ veor.u8 q1, q9, q11
+ bmi 9f
+1: \load_macro q12, q13, d24, d25, d26, d27, d28, d29, d30, d31, 0, \
+ <vorr d0, d2>, \
+ <vorr d1, d3>, \
+ <vorr d0, d1>, \
+ <vmov TMP1, s0>, \
+ <vmov TMP2, s1>, \
+ <veor.u8 d0, d24, d28>, \
+ <veor.u8 d1, d25, d29>, \
+ <pld [S_1, #prefetch_distance]>
+ orrs RES, TMP1, TMP2
+ veor.u8 q1, q13, q15
+ bne 33f
+ \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 1, \
+ <vorr d0, d2>, \
+ <vorr d1, d3>, \
+ <vorr d0, d1>, \
+ <vmov TMP1, s0>, \
+ <vmov TMP2, s1>, \
+ <veor.u8 d0, d16, d20>, \
+ <veor.u8 d1, d17, d21>, \
+ <pld [S_2, #prefetch_distance]>
+ orrs RES, TMP1, TMP2
+ veor.u8 q1, q9, q11
+ bne 31f
+ subs N, #64
+ bpl 1b
+9: vorr q0, q1
+ vorr d0, d1
+ vmov TMP1, s0
+ vmov TMP2, s1
+ orrs RES, TMP1, TMP2
+ bne 33f
+10: tst N, #32
+ beq 14f
+ \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 0
+ veor.u8 q0, q8, q10
+ veor.u8 q1, q9, q11
+ vorr q0, q1
+ vorr d0, d1
+ vmov TMP1, s0
+ vmov TMP2, s1
+ orrs RES, TMP1, TMP2
+ bne 33f
+14:
+.endm
+
+/*
+ * int memcmp(const void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to buffer 1
+ * a2 = pointer to buffer 2
+ * a3 = number of bytes to compare (as unsigned chars)
+ * On exit:
+ * a1 = >0/=0/<0 if s1 >/=/< s2
+ */
+
+.set prefetch_distance, 63
+
+myfunc memcmp
+ RES .req a1
+ S_2 .req a2
+ N .req a3
+ S_1 .req a4
+ S_1A .req v1
+ S_2A .req v2
+ SIXTEEN .req v3
+ TMP1 .req ip
+ TMP2 .req lr
+
+ // Based on real-world data, we are actually very likely to find a
+ // difference within the first few bytes, so it's unlikely to be
+ // beneficial to vectorise these. Test the first byte or bytes
+ // individually, stopping once at least the s2 pointer is 8-byte aligned.
+ mov S_1, a1
+ and RES, S_2, #7
+ push {lr}
+ rsb RES, #7
+ subs N, #1
+ ldrcsb TMP2, [S_2], #1
+ ldrcsb TMP1, [S_1], #1
+ bcc 43f
+ cmp RES, N
+ movcs RES, N
+ teq RES, #0
+ beq 9f
+ sub N, RES
+1: cmp TMP1, TMP2
+ ldrb TMP1, [S_1], #1
+ bne 41f
+ ldrb TMP2, [S_2], #1
+ subs RES, #1
+ bne 1b
+9: cmp TMP1, TMP2
+ bne 41f
+ teq N, #0
+ beq 43f // because it's very common to have found a match by now
+
+ tst S_1, #7
+ bne 50f
+
+ // Both aligned
+ process_32b_blocks load_32b_x2_aligned
+ lsls N, #32-5
+ beq 43f
+ bpl 15f
+ vld1.32 {d16}, [S_1 :64]!
+ vld1.32 {d20}, [S_2 :64]!
+ vld1.32 {d17}, [S_1 :64]!
+ vld1.32 {d21}, [S_2 :64]!
+15: lsls N, #2
+ bcc 16f
+ vld1.32 {d18}, [S_1 :64]!
+ vld1.32 {d22}, [S_2 :64]!
+16: bpl 17f
+ vld1.32 {d19[0]}, [S_1 :32]!
+ vld1.32 {d23[0]}, [S_2 :32]!
+17: lsls N, #2
+ bcc 18f
+ vld1.16 {d19[2]}, [S_1 :16]!
+ vld1.16 {d23[2]}, [S_2 :16]!
+18: bpl 19f
+ vld1.8 {d19[6]}, [S_1]!
+ vld1.8 {d23[6]}, [S_2]!
+19: veor.u8 q0, q8, q10
+ veor.u8 q1, q9, q11
+ vorr q0, q1
+ vorr d0, d1
+ vmov TMP1, s0
+ vmov TMP2, s1
+ orrs RES, TMP1, TMP2
+ bne 33f
+ pop {pc}
+
+20: // Make both banks match so the holes between loads won't affect result
+ vmov q8, q10
+ vmov q9, q11
+ b 10b
+
+31: // Diff found in q12-q15
+ push {v1,v2}
+ vrev32.8 q0, q12
+ vrev32.8 q1, q14
+ vmov a1, a2, d0
+ vmov a3, a4, d2
+ vmov v1, v2, d1
+ vmov ip, lr, d3
+ cmp a3, a1
+ vrev32.8 q0, q13
+ cmpeq a4, a2
+ vrev32.8 q1, q15
+ cmpeq ip, v1
+ vmov a1, a2, d0
+ cmpeq lr, v2
+ vmov a3, a4, d2
+ movne RES, #1
+ vmov v1, v2, d1
+ bne 32f
+ vmov ip, lr, d3
+ cmp a3, a1
+ cmpeq a4, a2
+ mov RES, #1
+ cmpeq ip, v1
+ cmpeq lr, v2
+32: subcs RES, #2
+ pop {v1,v2,pc}
+
+33: // Diff found in q8-q11
+ push {v1,v2}
+ vrev32.8 q0, q8
+ vrev32.8 q1, q10
+ vmov a1, a2, d0
+ vmov a3, a4, d2
+ vmov v1, v2, d1
+ vmov ip, lr, d3
+ cmp a3, a1
+ vrev32.8 q0, q9
+ cmpeq a4, a2
+ vrev32.8 q1, q11
+ cmpeq ip, v1
+ vmov a1, a2, d0
+ cmpeq lr, v2
+ vmov a3, a4, d2
+ movne RES, #1
+ vmov v1, v2, d1
+ bne 34f
+ vmov ip, lr, d3
+ cmp a3, a1
+ cmpeq a4, a2
+ mov RES, #1
+ cmpeq ip, v1
+ cmpeq lr, v2
+34: subcs RES, #2
+ pop {v1,v2,pc}
+
+41: movcc RES, #-1
+ movcs RES, #1
+ pop {pc}
+
+43: mov RES, #0
+ pop {pc}
+
+
+50: // Only S_2 is aligned
+ push {v1-v3}
+ and v3, S_1, #7
+ bic S_1, #7
+ add S_1A, S_1, #16
+ add S_2A, S_2, #8
+ vpush {q4}
+ lsl v3, #3
+ rsb v3, #0
+ vld1.32 {d7}, [S_1 :64]!
+ vmov s16, v3
+ add v3, #64
+ vmov s18, v3
+ mov SIXTEEN, #16
+ process_32b_blocks load_32b_x2_unaligned
+ lsls N, #32-5
+ beq 43f
+ // Reapply the offset to S_1 and use unaligned loads from here on
+ vmov TMP1, s16
+ sub S_1, #8
+ sub S_1, TMP1, asr #3
+ bpl 15f
+ vld1.32 {d16}, [S_1]!
+ vld1.32 {d20}, [S_2 :64]!
+ vld1.32 {d17}, [S_1]!
+ vld1.32 {d21}, [S_2 :64]!
+15: lsls N, #2
+ bcc 16f
+ vld1.32 {d18}, [S_1]!
+ vld1.32 {d22}, [S_2 :64]!
+16: bpl 17f
+ vld1.32 {d19[0]}, [S_1]!
+ vld1.32 {d23[0]}, [S_2 :32]!
+17: lsls N, #2
+ bcc 18f
+ vld1.16 {d19[2]}, [S_1]!
+ vld1.16 {d23[2]}, [S_2 :16]!
+18: bpl 19f
+ vld1.8 {d19[6]}, [S_1]!
+ vld1.8 {d23[6]}, [S_2]!
+19: veor.u8 q0, q8, q10
+ veor.u8 q1, q9, q11
+ vorr q0, q1
+ vorr d0, d1
+ vmov TMP1, s0
+ vmov TMP2, s1
+ orrs RES, TMP1, TMP2
+ bne 33f
+ vpop {q4}
+ pop {v1-v3,pc}
+
+20: // Make both banks match so the holes between loads won't affect result
+ vmov q8, q10
+ vmov q9, q11
+ b 10b
+
+31: // Diff found in q12-q15
+ vrev32.8 q0, q12
+ vrev32.8 q1, q14
+ vmov a1, a2, d0
+ vmov a3, a4, d2
+ vmov v1, v2, d1
+ vmov ip, lr, d3
+ cmp a3, a1
+ vrev32.8 q0, q13
+ cmpeq a4, a2
+ vrev32.8 q1, q15
+ cmpeq ip, v1
+ vmov a1, a2, d0
+ cmpeq lr, v2
+ vmov a3, a4, d2
+ movne RES, #1
+ vmov v1, v2, d1
+ bne 32f
+ vmov ip, lr, d3
+ cmp a3, a1
+ cmpeq a4, a2
+ mov RES, #1
+ cmpeq ip, v1
+ cmpeq lr, v2
+32: vpop {q4}
+ subcs RES, #2
+ pop {v1-v3,pc}
+
+33: // Diff found in q8-q11
+ vrev32.8 q0, q8
+ vrev32.8 q1, q10
+ vmov a1, a2, d0
+ vmov a3, a4, d2
+ vmov v1, v2, d1
+ vmov ip, lr, d3
+ cmp a3, a1
+ vrev32.8 q0, q9
+ cmpeq a4, a2
+ vrev32.8 q1, q11
+ cmpeq ip, v1
+ vmov a1, a2, d0
+ cmpeq lr, v2
+ vmov a3, a4, d2
+ movne RES, #1
+ vmov v1, v2, d1
+ bne 34f
+ vmov ip, lr, d3
+ cmp a3, a1
+ cmpeq a4, a2
+ mov RES, #1
+ cmpeq ip, v1
+ cmpeq lr, v2
+34: vpop {q4}
+ subcs RES, #2
+ pop {v1-v3,pc}
+
+43: vpop {q4}
+ mov RES, #0
+ pop {v1-v3,pc}
+ .size memcmp,.-memcmp
--- /dev/null
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
+ .if words == 1
+ .if backwards
+ mov r1, r0, lsl #32-align*8
+ ldr r0, [S, #-4]!
+ orr r1, r1, r0, lsr #align*8
+ str r1, [D, #-4]!
+ .else
+ mov r0, r1, lsr #align*8
+ ldr r1, [S, #4]!
+ orr r0, r0, r1, lsl #32-align*8
+ str r0, [D], #4
+ .endif
+ .elseif words == 2
+ .if backwards
+ ldr r1, [S, #-4]!
+ mov r2, r0, lsl #32-align*8
+ ldr r0, [S, #-4]!
+ orr r2, r2, r1, lsr #align*8
+ mov r1, r1, lsl #32-align*8
+ orr r1, r1, r0, lsr #align*8
+ stmdb D!, {r1, r2}
+ .else
+ ldr r1, [S, #4]!
+ mov r0, r2, lsr #align*8
+ ldr r2, [S, #4]!
+ orr r0, r0, r1, lsl #32-align*8
+ mov r1, r1, lsr #align*8
+ orr r1, r1, r2, lsl #32-align*8
+ stmia D!, {r0, r1}
+ .endif
+ .elseif words == 4
+ .if backwards
+ ldmdb S!, {r2, r3}
+ mov r4, r0, lsl #32-align*8
+ ldmdb S!, {r0, r1}
+ orr r4, r4, r3, lsr #align*8
+ mov r3, r3, lsl #32-align*8
+ orr r3, r3, r2, lsr #align*8
+ mov r2, r2, lsl #32-align*8
+ orr r2, r2, r1, lsr #align*8
+ mov r1, r1, lsl #32-align*8
+ orr r1, r1, r0, lsr #align*8
+ stmdb D!, {r1, r2, r3, r4}
+ .else
+ ldmib S!, {r1, r2}
+ mov r0, r4, lsr #align*8
+ ldmib S!, {r3, r4}
+ orr r0, r0, r1, lsl #32-align*8
+ mov r1, r1, lsr #align*8
+ orr r1, r1, r2, lsl #32-align*8
+ mov r2, r2, lsr #align*8
+ orr r2, r2, r3, lsl #32-align*8
+ mov r3, r3, lsr #align*8
+ orr r3, r3, r4, lsl #32-align*8
+ stmia D!, {r0, r1, r2, r3}
+ .endif
+ .elseif words == 8
+ .if backwards
+ ldmdb S!, {r4, r5, r6, r7}
+ mov r8, r0, lsl #32-align*8
+ ldmdb S!, {r0, r1, r2, r3}
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ orr r8, r8, r7, lsr #align*8
+ mov r7, r7, lsl #32-align*8
+ orr r7, r7, r6, lsr #align*8
+ mov r6, r6, lsl #32-align*8
+ orr r6, r6, r5, lsr #align*8
+ mov r5, r5, lsl #32-align*8
+ orr r5, r5, r4, lsr #align*8
+ mov r4, r4, lsl #32-align*8
+ orr r4, r4, r3, lsr #align*8
+ mov r3, r3, lsl #32-align*8
+ orr r3, r3, r2, lsr #align*8
+ mov r2, r2, lsl #32-align*8
+ orr r2, r2, r1, lsr #align*8
+ mov r1, r1, lsl #32-align*8
+ orr r1, r1, r0, lsr #align*8
+ stmdb D!, {r5, r6, r7, r8}
+ stmdb D!, {r1, r2, r3, r4}
+ .else
+ ldmib S!, {r1, r2, r3, r4}
+ mov r0, r8, lsr #align*8
+ ldmib S!, {r5, r6, r7, r8}
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ orr r0, r0, r1, lsl #32-align*8
+ mov r1, r1, lsr #align*8
+ orr r1, r1, r2, lsl #32-align*8
+ mov r2, r2, lsr #align*8
+ orr r2, r2, r3, lsl #32-align*8
+ mov r3, r3, lsr #align*8
+ orr r3, r3, r4, lsl #32-align*8
+ mov r4, r4, lsr #align*8
+ orr r4, r4, r5, lsl #32-align*8
+ mov r5, r5, lsr #align*8
+ orr r5, r5, r6, lsl #32-align*8
+ mov r6, r6, lsr #align*8
+ orr r6, r6, r7, lsl #32-align*8
+ mov r7, r7, lsr #align*8
+ orr r7, r7, r8, lsl #32-align*8
+ stmia D!, {r0, r1, r2, r3}
+ stmia D!, {r4, r5, r6, r7}
+ .endif
+ .endif
+.endm
+
+.macro memcpy_leading_15bytes backwards, align
+ movs DAT1, DAT2, lsl #31
+ sub N, N, DAT2
+ .if backwards
+ ldrmib DAT0, [S, #-1]!
+ ldrcsh DAT1, [S, #-2]!
+ strmib DAT0, [D, #-1]!
+ strcsh DAT1, [D, #-2]!
+ .else
+ ldrmib DAT0, [S], #1
+ ldrcsh DAT1, [S], #2
+ strmib DAT0, [D], #1
+ strcsh DAT1, [D], #2
+ .endif
+ movs DAT1, DAT2, lsl #29
+ .if backwards
+ ldrmi DAT0, [S, #-4]!
+ .if align == 0
+ ldmcsdb S!, {DAT1, DAT2}
+ .else
+ ldrcs DAT2, [S, #-4]!
+ ldrcs DAT1, [S, #-4]!
+ .endif
+ strmi DAT0, [D, #-4]!
+ stmcsdb D!, {DAT1, DAT2}
+ .else
+ ldrmi DAT0, [S], #4
+ .if align == 0
+ ldmcsia S!, {DAT1, DAT2}
+ .else
+ ldrcs DAT1, [S], #4
+ ldrcs DAT2, [S], #4
+ .endif
+ strmi DAT0, [D], #4
+ stmcsia D!, {DAT1, DAT2}
+ .endif
+.endm
+
+.macro memcpy_trailing_15bytes backwards, align
+ movs N, N, lsl #29
+ .if backwards
+ .if align == 0
+ ldmcsdb S!, {DAT0, DAT1}
+ .else
+ ldrcs DAT1, [S, #-4]!
+ ldrcs DAT0, [S, #-4]!
+ .endif
+ ldrmi DAT2, [S, #-4]!
+ stmcsdb D!, {DAT0, DAT1}
+ strmi DAT2, [D, #-4]!
+ .else
+ .if align == 0
+ ldmcsia S!, {DAT0, DAT1}
+ .else
+ ldrcs DAT0, [S], #4
+ ldrcs DAT1, [S], #4
+ .endif
+ ldrmi DAT2, [S], #4
+ stmcsia D!, {DAT0, DAT1}
+ strmi DAT2, [D], #4
+ .endif
+ movs N, N, lsl #2
+ .if backwards
+ ldrcsh DAT0, [S, #-2]!
+ ldrmib DAT1, [S, #-1]
+ strcsh DAT0, [D, #-2]!
+ strmib DAT1, [D, #-1]
+ .else
+ ldrcsh DAT0, [S], #2
+ ldrmib DAT1, [S]
+ strcsh DAT0, [D], #2
+ strmib DAT1, [D]
+ .endif
+.endm
+
+.macro memcpy_long_inner_loop backwards, align
+ .if align != 0
+ .if backwards
+ ldr DAT0, [S, #-align]!
+ .else
+ ldr LAST, [S, #-align]!
+ .endif
+ .endif
+110:
+ .if align == 0
+ .if backwards
+ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+ pld [S, OFF]
+ stmdb D!, {DAT4, DAT5, DAT6, LAST}
+ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
+ .else
+ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+ pld [S, OFF]
+ stmia D!, {DAT0, DAT1, DAT2, DAT3}
+ stmia D!, {DAT4, DAT5, DAT6, LAST}
+ .endif
+ .else
+ unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
+ .endif
+ subs N, N, #32
+ bhs 110b
+ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ preload_trailing backwards, S, 5, N, OFF
+ add N, N, #(prefetch_distance+2)*32 - 32
+120:
+ .if align == 0
+ .if backwards
+ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+ stmdb D!, {DAT4, DAT5, DAT6, LAST}
+ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
+ .else
+ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+ stmia D!, {DAT0, DAT1, DAT2, DAT3}
+ stmia D!, {DAT4, DAT5, DAT6, LAST}
+ .endif
+ .else
+ unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
+ .endif
+ subs N, N, #32
+ bhs 120b
+ tst N, #16
+ .if align == 0
+ .if backwards
+ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
+ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
+ stmneia D!, {DAT0, DAT1, DAT2, LAST}
+ .endif
+ .else
+ beq 130f
+ unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
+130:
+ .endif
+ /* Trailing words and bytes */
+ tst N, #15
+ beq 199f
+ .if align != 0
+ add S, S, #align
+ .endif
+ memcpy_trailing_15bytes backwards, align
+199:
+ pop {DAT3, DAT4, DAT5, DAT6, DAT7}
+ pop {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy_medium_inner_loop backwards, align
+120:
+ .if backwards
+ .if align == 0
+ ldmdb S!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ ldr LAST, [S, #-4]!
+ ldr DAT2, [S, #-4]!
+ ldr DAT1, [S, #-4]!
+ ldr DAT0, [S, #-4]!
+ .endif
+ stmdb D!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ .if align == 0
+ ldmia S!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ ldr DAT0, [S], #4
+ ldr DAT1, [S], #4
+ ldr DAT2, [S], #4
+ ldr LAST, [S], #4
+ .endif
+ stmia D!, {DAT0, DAT1, DAT2, LAST}
+ .endif
+ subs N, N, #16
+ bhs 120b
+ /* Trailing words and bytes */
+ tst N, #15
+ beq 199f
+ memcpy_trailing_15bytes backwards, align
+199:
+ pop {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy_short_inner_loop backwards, align
+ tst N, #16
+ .if backwards
+ .if align == 0
+ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ ldrne LAST, [S, #-4]!
+ ldrne DAT2, [S, #-4]!
+ ldrne DAT1, [S, #-4]!
+ ldrne DAT0, [S, #-4]!
+ .endif
+ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ .if align == 0
+ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
+ .else
+ ldrne DAT0, [S], #4
+ ldrne DAT1, [S], #4
+ ldrne DAT2, [S], #4
+ ldrne LAST, [S], #4
+ .endif
+ stmneia D!, {DAT0, DAT1, DAT2, LAST}
+ .endif
+ memcpy_trailing_15bytes backwards, align
+199:
+ pop {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy backwards
+ D .req a1
+ S .req a2
+ N .req a3
+ DAT0 .req a4
+ DAT1 .req v1
+ DAT2 .req v2
+ DAT3 .req v3
+ DAT4 .req v4
+ DAT5 .req v5
+ DAT6 .req v6
+ DAT7 .req sl
+ LAST .req ip
+ OFF .req lr
+
+ .cfi_startproc
+
+ push {D, DAT1, DAT2, lr}
+
+ .cfi_def_cfa_offset 16
+ .cfi_rel_offset D, 0
+ .cfi_undefined S
+ .cfi_undefined N
+ .cfi_undefined DAT0
+ .cfi_rel_offset DAT1, 4
+ .cfi_rel_offset DAT2, 8
+ .cfi_undefined LAST
+ .cfi_rel_offset lr, 12
+
+ .if backwards
+ add D, D, N
+ add S, S, N
+ .endif
+
+ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
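+ /* (worst case is 15 leading bytes to reach 16-byte alignment, so N >= 31
+ * still guarantees at least one full aligned 16-byte write) */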
+ cmp N, #31
+ blo 170f
+ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
+ cmp N, #(prefetch_distance+3)*32 - 1
+ blo 160f
+
+ /* Long case */
+ push {DAT3, DAT4, DAT5, DAT6, DAT7}
+
+ .cfi_def_cfa_offset 36
+ .cfi_rel_offset D, 20
+ .cfi_rel_offset DAT1, 24
+ .cfi_rel_offset DAT2, 28
+ .cfi_rel_offset DAT3, 0
+ .cfi_rel_offset DAT4, 4
+ .cfi_rel_offset DAT5, 8
+ .cfi_rel_offset DAT6, 12
+ .cfi_rel_offset DAT7, 16
+ .cfi_rel_offset lr, 32
+
+ /* Adjust N so that the decrement instruction can also test for
+ * inner loop termination. We want it to stop when there are
+ * (prefetch_distance+1) complete blocks to go. */
+ sub N, N, #(prefetch_distance+2)*32
+ preload_leading_step1 backwards, DAT0, S, 5
+ .if backwards
+ /* Bug in GAS: it accepts, but mis-assembles the instruction
+ * ands DAT2, D, #60, 2
+ * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
+ */
+ .word 0xE210513C
+ beq 154f
+ .else
+ ands DAT2, D, #15
+ beq 154f
+ rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
+ .endif
+ preload_leading_step2 backwards, DAT0, S, 5, DAT2, OFF
+ memcpy_leading_15bytes backwards, 1
+154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
+ /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
+ .if backwards
+ rsb OFF, S, #3
+ and OFF, OFF, #28
+ sub OFF, OFF, #32*(prefetch_distance+1)
+ .else
+ and OFF, S, #28
+ rsb OFF, OFF, #32*prefetch_distance
+ .endif
+ movs DAT0, S, lsl #31
+ bhi 157f
+ bcs 156f
+ bmi 155f
+ memcpy_long_inner_loop backwards, 0
+155: memcpy_long_inner_loop backwards, 1
+156: memcpy_long_inner_loop backwards, 2
+157: memcpy_long_inner_loop backwards, 3
+
+ .cfi_def_cfa_offset 16
+ .cfi_rel_offset D, 0
+ .cfi_rel_offset DAT1, 4
+ .cfi_rel_offset DAT2, 8
+ .cfi_same_value DAT3
+ .cfi_same_value DAT4
+ .cfi_same_value DAT5
+ .cfi_same_value DAT6
+ .cfi_same_value DAT7
+ .cfi_rel_offset lr, 12
+
+160: /* Medium case */
+ preload_all backwards, 0, 0, S, 5, N, DAT2, OFF
+ sub N, N, #16 /* simplifies inner loop termination */
+ .if backwards
+ ands DAT2, D, #15
+ beq 164f
+ .else
+ ands DAT2, D, #15
+ beq 164f
+ rsb DAT2, DAT2, #16
+ .endif
+ memcpy_leading_15bytes backwards, 1
+164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+ tst S, #3
+ bne 140f
+ memcpy_medium_inner_loop backwards, 0
+140: memcpy_medium_inner_loop backwards, 1
+
+170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ teq N, #0
+ beq 199f
+ preload_all backwards, 1, 0, S, 5, N, DAT2, LAST
+ tst D, #3
+ beq 174f
+172: subs N, N, #1
+ blo 199f
+ .if backwards
+ ldrb DAT0, [S, #-1]!
+ strb DAT0, [D, #-1]!
+ .else
+ ldrb DAT0, [S], #1
+ strb DAT0, [D], #1
+ .endif
+ tst D, #3
+ bne 172b
+174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+ tst S, #3
+ bne 140f
+ memcpy_short_inner_loop backwards, 0
+140: memcpy_short_inner_loop backwards, 1
+
+ .cfi_endproc
+
+ .unreq D
+ .unreq S
+ .unreq N
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+ .unreq LAST
+ .unreq OFF
+.endm
+
+/*
+ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+myfunc memcpy
+1000: memcpy 0
+ .size memcpy,.-memcpy
+.endfunc
+
+/*
+ * void *memmove(void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+myfunc memmove
+ cmp a2, a1
+ bpl 1000b /* pl works even when crossing the -1 to 0 and 0x7fffffff to 0x80000000 boundaries */
+ memcpy 1
+ .size memmove,.-memmove
+.endfunc
+
+/*
+ * void *mempcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 = pointer to immediately after destination block
+ */
+
+myfunc mempcpy
+.global __mempcpy
+.type __mempcpy STT_FUNC
+__mempcpy:
+ push {v1, lr}
+ mov v1, a3
+ bl 1000b
+ add a1, a1, v1
+ pop {v1, pc}
+ .size mempcpy,.-mempcpy
+ .size __mempcpy,.-__mempcpy
--- /dev/null
+/*
+Copyright (c) 2015, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+.macro memcpy_leading_63bytes backwards, align
+ movs TMP, LEAD, lsl #31
+ bpl 1f
+ .if backwards
+ sub S, S, #1
+ sub D, D, #1
+ vld1.8 {d7[7]}, [S]
+ vst1.8 {d7[7]}, [D]
+ .else
+ vld1.8 {d7[7]}, [S]!
+ vst1.8 {d7[7]}, [D]!
+ .endif
+1: bcc 1f
+ .if backwards
+ .if align == 0 || align == 2
+ sub S, S, #2
+ sub D, D, #2
+ vld1.16 {d7[3]}, [S :16]
+ .else
+ sub S, S, #1
+ sub D, D, #2
+ vld1.8 {d7[7]}, [S]
+ sub S, S, #1
+ vld1.8 {d7[6]}, [S]
+ .endif
+ vst1.16 {d7[3]}, [D :16]
+ .else
+ .if align == 0 || align == 2
+ vld1.16 {d7[3]}, [S :16]!
+ .else
+ vld1.8 {d7[6]}, [S]!
+ vld1.8 {d7[7]}, [S]!
+ .endif
+ vst1.16 {d7[3]}, [D :16]!
+ .endif
+1:
+ .if align == 0
+ movs TMP, LEAD, lsl #29
+ .if backwards
+ vldmdbmi S!, {s13}
+ vldmdbcs S!, {d7}
+ vstmdbmi D!, {s13}
+ vstmdbcs D!, {d7}
+ .else
+ vldmiami S!, {s13}
+ vldmiacs S!, {d7}
+ vstmiami D!, {s13}
+ vstmiacs D!, {d7}
+ .endif
+ movs TMP, LEAD, lsl #27
+ .if backwards
+ vldmdbmi S!, {d2-d3}
+ vldmdbcs S!, {d4-d7}
+ vstmdbmi D!, {d2-d3}
+ vstmdbcs D!, {d4-d7}
+ .else
+ vldmiami S!, {d2-d3}
+ vldmiacs S!, {d4-d7}
+ vstmiami D!, {d2-d3}
+ vstmiacs D!, {d4-d7}
+ .endif
+ .else
+ .if backwards
+ add S, S, #4-align
+ vldmdb S!, {s0}
+ .else
+ sub S, S, #align
+ vldmia S!, {s19}
+ .endif
+ movs TMP, LEAD, lsl #29
+ bpl 1f
+ .if backwards
+ vmov s1, s0
+ vldmdb S!, {s0}
+ vext.8 d1, d0, d1, #align
+ vstmdb D!, {s2}
+ .else
+ vmov s18, s19
+ vldmia S!, {s19}
+ vext.8 d8, d9, d10, #align
+ vstmia D!, {s16}
+ .endif
+1: bcc 1f
+ .if backwards
+ vmov s2, s0
+ vldmdb S!, {d0}
+ vext.8 d1, d0, d1, #align
+ vstmdb D!, {d1}
+ .else
+ vmov s17, s19
+ vldmia S!, {d9}
+ vext.8 d8, d8, d9, #4+align
+ vstmia D!, {d8}
+ .endif
+1: movs TMP, LEAD, lsl #27
+ bpl 1f
+ .if backwards
+ vmov s4, s0
+ vldmdb S!, {d0-d1}
+ vext.8 q1, q0, q1, #align
+ vstmdb D!, {d2-d3}
+ .else
+ vmov s15, s19
+ vldmia S!, {d8-d9}
+ vext.8 q3, q3, q4, #12+align
+ vstmia D!, {d6-d7}
+ .endif
+1: bcc 1f
+ .if backwards
+ vmov s8, s0
+ vldmdb S!, {d0-d3}
+ vext.8 q2, q1, q2, #align
+ vext.8 q1, q0, q1, #align
+ vstmdb D!, {d2-d5}
+ .else
+ vmov s11, s19
+ vldmia S!, {d6-d9}
+ vext.8 q2, q2, q3, #12+align
+ vext.8 q3, q3, q4, #12+align
+ vstmia D!, {d4-d7}
+ .endif
+1:
+ .endif
+.endm
+
+.macro memcpy_middle_64bytes backwards, align, use_pld, add_nops
+ .if align == 0
+ .if backwards
+ vldmdb S!, {d0-d7}
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ vstmdb D!, {d0-d7}
+ .else
+ vldmia S!, {d0-d7}
+ .if add_nops
+ .rept 14
+ nop
+ .endr
+ .endif
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ vstmia D!, {d0-d7}
+ .if add_nops
+ .rept 7
+ nop
+ .endr
+ .endif
+ .endif
+ .else
+ .if backwards
+ vmov s16, s0
+ vldmdb S!, {d0-d7}
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ vext.8 q4, q3, q4, #align
+ vext.8 q3, q2, q3, #align
+ vext.8 q2, q1, q2, #align
+ vext.8 q1, q0, q1, #align
+ vstmdb D!, {d2-d9}
+ .else
+ vmov s3, s19
+ vldmia S!, {d2-d9}
+ .if add_nops
+ .rept 7
+ nop
+ .endr
+ .endif
+ .if use_pld
+ pld [S, OFF]
+ .endif
+ vext.8 q0, q0, q1, #12+align
+ vext.8 q1, q1, q2, #12+align
+ vext.8 q2, q2, q3, #12+align
+ vext.8 q3, q3, q4, #12+align
+ .if add_nops
+ nop
+ nop
+ nop
+ .endif
+ vstmia D!, {d0-d7}
+ .if add_nops
+ nop
+ nop
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro memcpy_trailing_63bytes backwards, align
+ movs TMP, N, lsl #27
+ .if align == 0
+ .if backwards
+ vldmdbcs S!, {d4-d7}
+ vldmdbmi S!, {d2-d3}
+ vstmdbcs D!, {d4-d7}
+ vstmdbmi D!, {d2-d3}
+ .else
+ vldmiacs S!, {d4-d7}
+ vldmiami S!, {d2-d3}
+ vstmiacs D!, {d4-d7}
+ vstmiami D!, {d2-d3}
+ .endif
+ movs TMP, N, lsl #29
+ .if backwards
+ vldmdbcs S!, {d7}
+ vldmdbmi S!, {s13}
+ vstmdbcs D!, {d7}
+ vstmdbmi D!, {s13}
+ .else
+ vldmiacs S!, {d7}
+ vldmiami S!, {s13}
+ vstmiacs D!, {d7}
+ vstmiami D!, {s13}
+ .endif
+ .else
+ bcc 1f
+ .if backwards
+ vmov s8, s0
+ vldmdb S!, {d0-d3}
+ vext.8 q2, q1, q2, #align
+ vext.8 q1, q0, q1, #align
+ vstmdb D!, {d2-d5}
+ .else
+ vmov s11, s19
+ vldmia S!, {d6-d9}
+ vext.8 q2, q2, q3, #12+align
+ vext.8 q3, q3, q4, #12+align
+ vstmia D!, {d4-d7}
+ .endif
+1: bpl 1f
+ .if backwards
+ vmov s4, s0
+ vldmdb S!, {d0-d1}
+ vext.8 q1, q0, q1, #align
+ vstmdb D!, {d2-d3}
+ .else
+ vmov s15, s19
+ vldmia S!, {d8-d9}
+ vext.8 q3, q3, q4, #12+align
+ vstmia D!, {d6-d7}
+ .endif
+1: movs TMP, N, lsl #29
+ bcc 1f
+ .if backwards
+ vmov s2, s0
+ vldmdb S!, {d0}
+ vext.8 d1, d0, d1, #align
+ vstmdb D!, {d1}
+ .else
+ vmov s17, s19
+ vldmia S!, {d9}
+ vext.8 d8, d8, d9, #4+align
+ vstmia D!, {d8}
+ .endif
+1: bpl 1f
+ .if backwards
+ vmov s1, s0
+ vldmdb S!, {s0}
+ vext.8 d1, d0, d1, #align
+ vstmdb D!, {s2}
+1: add S, S, #align
+ .else
+ vmov s18, s19
+ vldmia S!, {s19}
+ vext.8 d8, d9, d10, #align
+ vstmia D!, {s16}
+1: sub S, S, #4-align
+ .endif
+ .endif
+ movs TMP, N, lsl #31
+ bcc 1f
+ .if backwards
+ .if align == 0 || align == 2
+ sub S, S, #2
+ sub D, D, #2
+ vld1.16 {d7[3]}, [S :16]
+ .else
+ sub S, S, #1
+ sub D, D, #2
+ vld1.8 {d7[7]}, [S]
+ sub S, S, #1
+ vld1.8 {d7[6]}, [S]
+ .endif
+ vst1.16 {d7[3]}, [D :16]
+ .else
+ .if align == 0 || align == 2
+ vld1.16 {d7[3]}, [S :16]!
+ .else
+ vld1.8 {d7[6]}, [S]!
+ vld1.8 {d7[7]}, [S]!
+ .endif
+ vst1.16 {d7[3]}, [D :16]!
+ .endif
+1: bpl 1f
+ .if backwards
+ sub S, S, #1
+ sub D, D, #1
+ vld1.8 {d7[7]}, [S]
+ vst1.8 {d7[7]}, [D]
+ .else
+ vld1.8 {d7[7]}, [S]!
+ vst1.8 {d7[7]}, [D]!
+ .endif
+1:
+.endm
+
+.macro memcpy_long_inner_loop backwards, align, add_nops
+ .if backwards
+ /* Bug in GAS: it accepts, but mis-assembles the instruction
+ * ands LEAD, D, #252, 2
+ * which sets LEAD to the number of leading bytes until destination is aligned and also clears C (sets borrow)
+ */
+ .word 0xE210C1FC
+ beq 154f
+ .else
+ ands LEAD, D, #63
+ beq 154f
+ rsb LEAD, LEAD, #64 /* number of leading bytes until destination aligned */
+ .endif
+ preload_leading_step2 backwards, P, S, 6, LEAD, TMP
+ memcpy_leading_63bytes backwards, align
+ sub N, N, LEAD
+ .if align != 0
+ b 155f
+ .endif
+154:
+ .if align != 0
+ .if backwards
+ add S, S, #4-align
+ vldmdb S!, {s0}
+ .else
+ sub S, S, #align
+ vldmia S!, {s19}
+ .endif
+ .endif
+155: /* Destination now 64-byte aligned; we have at least one prefetch as well as at least one 64-byte output block */
+ /* Prefetch offset is best selected such that it lies in the first 16 of each 64 bytes - but it's just as easy to aim for the first one */
+ .if backwards
+ rsb OFF, S, #0
+ and OFF, OFF, #60
+ sub OFF, OFF, #64*(prefetch_distance+1)
+ .else
+ and OFF, S, #60
+ rsb OFF, OFF, #64*prefetch_distance
+ .endif
+110: memcpy_middle_64bytes backwards, align, 1, add_nops
+ subs N, N, #64
+ bhs 110b
+ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ preload_trailing backwards, S, 6, N, OFF
+ add N, N, #(prefetch_distance+2)*64 - 64
+120: memcpy_middle_64bytes backwards, align, 0, add_nops
+ subs N, N, #64
+ bhs 120b
+ /* Trailing words and bytes */
+ tst N, #63
+ beq 199f
+ memcpy_trailing_63bytes backwards, align
+199:
+ vpop {d8-d9}
+ pop {a1,pc}
+.endm
+
+.macro memcpy_medium_inner_loop backwards, align
+ .if backwards
+ ands LEAD, D, #63
+ beq 164f
+ .else
+ ands LEAD, D, #63
+ beq 164f
+ rsb LEAD, LEAD, #64
+ .endif
+ memcpy_leading_63bytes backwards, align
+ sub N, N, LEAD
+ .if align != 0
+ b 165f
+ .endif
+164:
+ .if align != 0
+ .if backwards
+ add S, S, #4-align
+ vldmdb S!, {s0}
+ .else
+ sub S, S, #align
+ vldmia S!, {s19}
+ .endif
+ .endif
+165: /* Destination now 64-byte aligned */
+ subs N, N, #64
+ blo 129f
+120: memcpy_middle_64bytes backwards, align, 0, 0
+ subs N, N, #64
+ bhs 120b
+129: /* Trailing words and bytes */
+ tst N, #63
+ beq 199f
+ memcpy_trailing_63bytes backwards, align
+199:
+ vpop {d8-d9}
+ pop {a1,pc}
+.endm
+
+.macro memcpy_short_inner_loop backwards, align
+ .if align != 0
+ .if backwards
+ add S, S, #4-align
+ vldmdb S!, {s0}
+ .else
+ sub S, S, #align
+ vldmia S!, {s19}
+ .endif
+ .endif
+ memcpy_trailing_63bytes backwards, align
+199:
+ vpop {d8-d9}
+ pop {a1,pc}
+.endm
+
+.macro memcpy backwards
+ D .req a1
+ S .req a2
+ N .req a3
+ P .req a4
+ LEAD .req ip
+ OFF .req ip
+ TMP .req lr
+
+ .cfi_startproc
+
+ push {a1,lr}
+ vpush {d8-d9}
+
+ .cfi_def_cfa_offset 16
+ .cfi_rel_offset D, 8
+ .cfi_undefined S
+ .cfi_undefined N
+ .cfi_undefined P
+ .cfi_undefined LEAD
+ .cfi_rel_offset lr, 12
+
+ add ip, D, N
+ /* See if we cross a 64-byte boundary at the destination */
+ .if backwards
+ /* Also point S and D at the buffer ends if working downwards */
+ eor D, ip, D
+ add S, S, N
+ bics D, D, #63
+ mov D, ip
+ beq 170f
+ .else
+ eor ip, ip, D
+ bics ip, ip, #63
+ beq 170f
+ .endif
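+ /* Illustrative example: D = 0x1010 with N = 0x20 gives end = 0x1030, and
+ * (0x1010 ^ 0x1030) & ~63 = 0, so no 64-byte boundary is crossed and the
+ * short-case path at 170 below is taken */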
+
+ /* To preload ahead as we go, we need at least (prefetch_distance+2) 64-byte blocks */
+ .if prefetch_distance > 1
+ movw ip, #(prefetch_distance+3)*64 - 1
+ cmp N, ip
+ .else
+ cmp N, #(prefetch_distance+3)*64 - 1
+ .endif
+ blo 160f
+
+ .if !backwards
+ /* If the data is not in the L2 cache, we get up to a 5% speed
+ * boost by spacing out the instructions with NOPs. Use data
+ * length to estimate whether this is the case. */
+ cmp N, #512*1024 @ L2 cache size for BCM2836 Cortex-A7
+ blo 150f
+
+ sub N, N, #(prefetch_distance+2)*64
+ preload_leading_step1 backwards, P, S, 6
+
+ sub TMP, S, D
+ movs TMP, TMP, lsl #31
+ bhi 148f
+ bcs 147f
+ bmi 146f
+ memcpy_long_inner_loop backwards, 0, 1
+146: memcpy_long_inner_loop backwards, 1, 1
+147: memcpy_long_inner_loop backwards, 2, 1
+148: memcpy_long_inner_loop backwards, 3, 1
+ .endif
+
+150: /* Long case */
+ /* Adjust N so that the decrement instruction can also test for
+ * inner loop termination. We want it to stop when there are
+ * (prefetch_distance+1) complete blocks to go. */
+ sub N, N, #(prefetch_distance+2)*64
+ preload_leading_step1 backwards, P, S, 6
+
+ sub TMP, S, D
+ movs TMP, TMP, lsl #31
+ bhi 158f
+ bcs 157f
+ bmi 156f
+ memcpy_long_inner_loop backwards, 0, 0
+156: memcpy_long_inner_loop backwards, 1, 0
+157: memcpy_long_inner_loop backwards, 2, 0
+158: memcpy_long_inner_loop backwards, 3, 0
+
+160: /* Medium case */
+ preload_all backwards, 0, 0, S, 6, N, OFF, TMP
+
+ sub TMP, S, D
+ movs TMP, TMP, lsl #31
+ bhi 168f
+ bcs 167f
+ bmi 166f
+ memcpy_medium_inner_loop backwards, 0
+166: memcpy_medium_inner_loop backwards, 1
+167: memcpy_medium_inner_loop backwards, 2
+168: memcpy_medium_inner_loop backwards, 3
+
+170: /* Short case: the destination does not cross a 64-byte boundary, so no guarantee of a full 64-byte block */
+ teq N, #0
+ beq 199f
+ preload_all backwards, 1, 0, S, 6, N, OFF, TMP
+
+ tst D, #3
+ beq 174f
+172: subs N, N, #1
+ blo 199f
+ .if backwards
+ sub S, S, #1
+ sub D, D, #1
+ vld1.8 {d7[7]}, [S]
+ vst1.8 {d7[7]}, [D]
+ .else
+ vld1.8 {d7[7]}, [S]!
+ vst1.8 {d7[7]}, [D]!
+ .endif
+ tst D, #3
+ bne 172b
+174: /* Destination now 4-byte aligned; we have 1 or more output bytes to go */
+ sub TMP, S, D
+ movs TMP, TMP, lsl #31
+ bhi 178f
+ bcs 177f
+ bmi 176f
+ memcpy_short_inner_loop backwards, 0
+176: memcpy_short_inner_loop backwards, 1
+177: memcpy_short_inner_loop backwards, 2
+178: memcpy_short_inner_loop backwards, 3
+
+ .cfi_endproc
+
+ .unreq D
+ .unreq S
+ .unreq N
+ .unreq P
+ .unreq LEAD
+ .unreq OFF
+ .unreq TMP
+.endm
+
+/*
+ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 2
+
+myfunc memcpy
+1000: memcpy 0
+ .size memcpy,.-memcpy
+.endfunc
+
+/*
+ * void *memmove(void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 2
+
+myfunc memmove
+ cmp a2, a1
+ bpl 1000b /* pl works even when crossing the -1 to 0 and 0x7fffffff to 0x80000000 boundaries */
+ memcpy 1
+ .size memmove,.-memmove
+.endfunc
+
+/*
+ * void *mempcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 = pointer to immediately after destination block
+ */
+
+myfunc mempcpy
+.global __mempcpy
+.type __mempcpy STT_FUNC
+__mempcpy:
+ push {v1, lr}
+ mov v1, a3
+ bl 1000b
+ add a1, a1, v1
+ pop {v1, pc}
+ .size mempcpy,.-mempcpy
+ .size __mempcpy,.-__mempcpy
--- /dev/null
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/*
+ * void *memset(void *s, int c, size_t n);
+ * On entry:
+ * a1 = pointer to buffer to fill
+ * a2 = byte pattern to fill with (caller-narrowed)
+ * a3 = number of bytes to fill
+ * On exit:
+ * a1 preserved
+ */
+myfunc memset
+ S .req a1
+ DAT0 .req a2
+ N .req a3
+ DAT1 .req a4
+ DAT2 .req ip
+ DAT3 .req lr
+
+ and DAT0, DAT0, #255
+ push {S, lr}
+ orr DAT0, DAT0, lsl #8
+ orr DAT0, DAT0, lsl #16
+ mov DAT1, DAT0
+
+ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
+ cmp N, #31
+ blo 170f
+
+161: sub N, N, #16 /* simplifies inner loop termination */
+ /* Leading words and bytes */
+ tst S, #15
+ beq 164f
+ rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
+ movs DAT2, DAT3, lsl #31
+ submi N, N, #1
+ strmib DAT0, [S], #1
+ subcs N, N, #2
+ strcsh DAT0, [S], #2
+ movs DAT2, DAT3, lsl #29
+ submi N, N, #4
+ strmi DAT0, [S], #4
+ subcs N, N, #8
+ stmcsia S!, {DAT0, DAT1}
+164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
+ mov DAT2, DAT0
+ mov DAT3, DAT0
+ /* Now the inner loop of 16-byte stores */
+165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
+ subs N, N, #16
+ bhs 165b
+166: /* Trailing words and bytes */
+ movs N, N, lsl #29
+ stmcsia S!, {DAT0, DAT1}
+ strmi DAT0, [S], #4
+ movs N, N, lsl #2
+ strcsh DAT0, [S], #2
+ strmib DAT0, [S]
+199: pop {S, pc}
+
+170: /* Short case */
+ mov DAT2, DAT0
+ mov DAT3, DAT0
+ tst S, #3
+ beq 174f
+172: subs N, N, #1
+ blo 199b
+ strb DAT0, [S], #1
+ tst S, #3
+ bne 172b
+174: tst N, #16
+ stmneia S!, {DAT0, DAT1, DAT2, DAT3}
+ b 166b
+ .size memset,.-memset
+
+ .unreq S
+ .unreq DAT0
+ .unreq N
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+.endfunc
--- /dev/null
+/*
+Copyright (c) 2018, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/*
+ * void *memset(void *s, int c, size_t n);
+ * On entry:
+ * a1 = pointer to buffer to fill
+ * a2 = byte pattern to fill with (only the low 8 bits are used)
+ * a3 = number of bytes to fill
+ * On exit:
+ * a1 preserved
+ */
+myfunc memset
+ SJ .req a2
+ N .req a3
+ SI .req a4
+ OFF .req ip
+
+ mov SI, a1
+ vdup.8 q0, a2
+ cmp N, #15+64
+ vdup.8 q1, a2
+ blo 170f
+
+161: ands ip, a1, #15
+ beq 164f
+ rsb ip, ip, #16 /* number of leading bytes until 16-byte aligned */
+ sub N, N, ip
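+ /* rbit moves the 4-bit leading count to the top of ip so the lsl #2 shifts below can
+ * drop count bits 1 and 2, then bit 3, into the C and N flags; count bit 0 equals
+ * bit 0 of the original pointer, which the tst checks without waiting for rbit */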
+ rbit ip, ip
+ tst a1, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
+ strneb a2, [SI], #1
+ movs ip, ip, lsl #2
+ strcsb a2, [SI, #1]
+ strcsb a2, [SI], #2
+ vstmmi SI!, {s0}
+ movs ip, ip, lsl #2
+ vstmcs SI!, {d0}
+164: /* Setup for the inner loop */
+ mov OFF, #64
+ sub N, N, #64 /* simplifies inner loop termination */
+ add SJ, SI, #32
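+ /* SI and SJ cover alternating 32-byte halves of each 64-byte block, both stepping by OFF */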
+ /* Now the inner loop of 2x32-byte stores */
+165: vst1.8 {q0-q1}, [SI :128], OFF
+ subs N, N, #64
+ vst1.8 {q0-q1}, [SJ :128], OFF
+ bhs 165b
+ /* Trailing words and bytes */
+166: vmov.32 a2, d0[0]
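+ /* The scalar stores below need the byte replicated across a word, and a2 may have
+ * been reused as SJ, so fetch the pattern back from d0 */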
+ movs N, N, lsl #27
+ bcc 167f
+ vst1.8 {q0-q1}, [SI]!
+167: bpl 168f
+ vst1.8 {q0}, [SI]!
+168: movs N, N, lsl #2
+ vstmcs SI!, {d0}
+ strmi a2, [SI], #4
+ movs N, N, lsl #2
+ strcsh a2, [SI], #2
+ strmib a2, [SI]
+199: bx lr
+
+170: /* Short case */
+ tst SI, #3
+ beq 174f
+172: subs N, N, #1
+ blo 199b
+ strb a2, [SI], #1
+ tst SI, #3
+ bne 172b
+174: cmp N, #32
+ bcc 166b
+ vst1.8 {q0-q1}, [SI]!
+ sub N, N, #32
+ b 166b
+ .size memset,.-memset
+
+ .unreq SJ
+ .unreq N
+ .unreq SI
+ .unreq OFF
+.endfunc
--- /dev/null
+/*
+Copyright (c) 2019, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/*
+ * size_t strlen (const char *__s);
+ * On entry:
+ * a1 = pointer to string
+ * On exit:
+ * a1 = length of string, excluding the terminator
+ */
+myfunc strlen
+ PTR .req a1
+ OPTR .req a2
+ MASK .req a3
+ TMP0 .req a4
+ TMP1 .req v1
+ TMP2 .req ip
+ TMP3 .req lr
+
+ push {v1,lr}
+ mov OPTR, PTR
+ movw MASK, #0xff8
+ tst PTR, #7
+ bne 20f
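+ /* MASK & ~PTR is zero only when PTR lies in the last 8 bytes of a 4K page,
+ * where a 16-byte load could fault on the following page */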
+ bics TMP0, MASK, PTR
+ beq 20f
+
+10: /* Handle 16 SIMD bytes per iteration until we hit a load that crosses a page boundary */
+ /* Loop rotated so that termination test is in what would otherwise be a stall */
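+ /* vceq turns each NUL byte into 0xff; the teq chain leaves EQ only if all 16
+ * comparison bytes are zero, i.e. no terminator was found in this block */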
+ vld1.8 {d0,d1}, [PTR :64]!
+ bics TMP0, MASK, PTR
+ beq 12f
+11: vceq.i8 d0, #0
+ vceq.i8 d1, #0
+ vmov TMP0, s0
+ vmov TMP1, s1
+ vmov TMP2, s2
+ vmov TMP3, s3
+ teq TMP0, #0
+ teqeq TMP1, #0
+ teqeq TMP2, #0
+ teqeq TMP3, #0
+ bne 33f
+ vld1.8 {d0,d1}, [PTR :64]!
+ bics TMP0, MASK, PTR
+ bne 11b
+12: vceq.i8 d0, #0
+ vceq.i8 d1, #0
+ vmov TMP0, s0
+ vmov TMP1, s1
+ vmov TMP2, s2
+ vmov TMP3, s3
+ teq TMP0, #0
+ teqeq TMP1, #0
+ teqeq TMP2, #0
+ teqeq TMP3, #0
+ bne 33f
+ /* Drop through... */
+
+20: /* Handle one byte per iteration, for leading unaligned bytes or when approaching a page boundary */
+ ldrb TMP0, [PTR], #1
+21: tst PTR, #7
+ beq 22f
+ teq TMP0, #0
+ beq 23f
+ ldrb TMP0, [PTR], #1
+ b 21b
+
+22: teq TMP0, #0
+ beq 23f
+ bics TMP0, MASK, PTR
+ bne 10b
+ b 20b
+
+23: /* Terminating null found during single-byte iteration */
+ sub a1, PTR, OPTR
+ sub a1, #1
+ pop {v1,pc}
+
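+ /* For labels 30-33: after vceq the terminator byte reads as 0xff, so rev + clz gives
+ * 8x the byte index of the first NUL within its word; PTR is already 16 bytes past
+ * the block that contains it, hence the subtractions */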
+30: /* Terminating null found within TMP0 during SIMD iteration */
+ rev TMP0, TMP0
+ clz TMP0, TMP0
+ sub a1, PTR, OPTR
+ sub a1, #16
+ add a1, TMP0, lsr #3
+ pop {v1,pc}
+
+31: /* Terminating null found within TMP1 during SIMD iteration */
+ rev TMP1, TMP1
+ clz TMP1, TMP1
+ sub a1, PTR, OPTR
+ sub a1, #12
+ add a1, TMP1, lsr #3
+ pop {v1,pc}
+
+32: /* Terminating null found within TMP2 during SIMD iteration */
+ rev TMP2, TMP2
+ clz TMP2, TMP2
+ sub a1, PTR, OPTR
+ sub a1, #8
+ add a1, TMP2, lsr #3
+ pop {v1,pc}
+
+33: teq TMP0, #0
+ bne 30b
+ teq TMP1, #0
+ bne 31b
+ teq TMP2, #0
+ bne 32b
+
+ /* Terminating null found within TMP3 during SIMD iteration */
+ rev TMP3, TMP3
+ clz TMP3, TMP3
+ sub a1, PTR, OPTR
+ sub a1, #4
+ add a1, TMP3, lsr #3
+ pop {v1,pc}
+ .size strlen,.-strlen
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#include <sys/mman.h>
+
+//extern size_t mystrlen(const char *s);
+//#define strlen mystrlen
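+// Uncommenting these (and linking an object that provides mystrlen) makes the harness
+// exercise that implementation instead of whichever strlen the toolchain resolves.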
+
+#define PAGESIZE 4096
+
+int main(void)
+{
+ /* To check we don't accidentally read off the end of the string
+ * across a page boundary, do our tests up to a mapped-out page.
+ * To check we handle boundaries between valid pages, we require
+ * two mapped-in pages beforehand.
+ */
+ char *buffer = mmap(NULL, 3*PAGESIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (buffer == MAP_FAILED)
+ {
+ fprintf(stderr, "mmap() failed\n");
+ exit(EXIT_FAILURE);
+ }
+ if (mprotect(buffer + 2*PAGESIZE, PAGESIZE, PROT_NONE) != 0)
+ {
+ perror("mprotect");
+ munmap(buffer, 3*PAGESIZE);
+ exit(EXIT_FAILURE);
+ }
+
+ for (uint32_t postamble = 0; postamble <= 32; postamble++)
+ {
+ memset(buffer, 'x', 2*PAGESIZE);
+ buffer[2*PAGESIZE - 1 - postamble] = '\0';
+ for (uint32_t start = 0; start <= 2*PAGESIZE - 1 - postamble; start++)
+ assert(strlen(buffer + start) == 2*PAGESIZE - 1 - postamble - start);
+ }
+
+ printf("strlen passes OK\n");
+ munmap(buffer, 3*PAGESIZE);
+ exit(EXIT_SUCCESS);
+}
--- /dev/null
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#define L1CACHESIZE (16*1024)
+#define L2CACHESIZE (128*1024)
+#define KILOBYTE (1024)
+#define MEGABYTE (1024*1024)
+
+#define TESTSIZE (40*MEGABYTE)
+
+#define TILEWIDTH (32)
+#define TINYWIDTH (8)
+
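+/* Select the routine under test by making exactly one of these conditionals true */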
+#if 1
+#define CANDIDATE memcpy
+#define CANDIDATE_RETURN_TYPE void *
+#elif 1
+#define CANDIDATE memset
+#define CANDIDATE_RETURN_TYPE void *
+#elif 1
+#define CANDIDATE memcmp
+#define CANDIDATE_RETURN_TYPE int
+#endif
+
+
+/* Just used for cancelling out the overheads */
+static CANDIDATE_RETURN_TYPE control(const void *s1, const void *s2, size_t n)
+{
+ return 0;
+}
+
+static uint64_t gettime(void)
+{
+ struct timeval tv;
+
+ gettimeofday (&tv, NULL);
+ return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
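+/* Cache-resident case: buffers sized for L1 or L2, destination pre-touched every pass */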
+static uint32_t bench_L(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t len, size_t times)
+{
+ int i, j, x = 0, q = 0;
+ volatile int qx;
+ for (i = times; i >= 0; i--)
+ {
+ /* Ensure the destination is in cache (if it gets flushed out, source gets reloaded anyway) */
+ for (j = 0; j < len; j += 32)
+ q += a[j];
+ q += a[len-1];
+ x = (x + 1) & 63;
+ test(a + x, b + 63 - x, len);
+ }
+ qx = q;
+ return len * times;
+}
+
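+/* Memory-bound case: same access pattern as bench_L but on megabyte buffers, no pre-touching */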
+static uint32_t bench_M(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t len, size_t times)
+{
+ int i, x = 0;
+ for (i = times; i >= 0; i--)
+ {
+ x = (x + 1) & 63;
+ test(a + x, b + 63 - x, len);
+ }
+ return len * times;
+}
+
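+/* Tile case: operations of 1..2*TILEWIDTH bytes walking sequentially through the buffers */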
+static uint32_t bench_T(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+ uint32_t total = 0;
+ int i, x = 0;
+
+ srand (0);
+ for (i = times; i >= 0; i--)
+ {
+ int w = (rand () % (TILEWIDTH * 2)) + 1;
+ if (x + w > MEGABYTE)
+ x = 0;
+ test(a + x, b + x, w);
+ x += w;
+ total += w;
+ }
+ return total;
+}
+
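+/* Random case: tile-sized operations at independent random offsets in each buffer */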
+static uint32_t bench_R(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+ uint32_t total = 0;
+ int i;
+
+ srand (0);
+ for (i = times; i >= 0; i--)
+ {
+ int w = (rand () % (TILEWIDTH * 2)) + 1;
+ int ax = (rand() % (MEGABYTE - TILEWIDTH * 2));
+ int bx = (rand() % (MEGABYTE - TILEWIDTH * 2));
+ test(a + ax, b + bx, w);
+ total += w;
+ }
+ return total;
+}
+
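+/* Fixed-width random case: used by the (disabled) width sweep at the end of main() */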
+static uint32_t bench_RW(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t w, size_t times)
+{
+ uint32_t total = 0;
+ int i;
+
+ srand (0);
+ for (i = times; i >= 0; i--)
+ {
+ int ax = (rand() % (MEGABYTE - 1024));
+ int bx = (rand() % (MEGABYTE - 1024));
+ test(a + ax, b + bx, w);
+ total += w;
+ }
+ return total;
+}
+
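+/* Random tiny case: operations of 1..2*TINYWIDTH bytes at random offsets */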
+static uint32_t bench_RT(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+ uint32_t total = 0;
+ int i;
+
+ srand (0);
+ for (i = times; i >= 0; i--)
+ {
+ int w = (rand () % (TINYWIDTH * 2)) + 1;
+ int ax = (rand() % (MEGABYTE - TINYWIDTH * 2));
+ int bx = (rand() % (MEGABYTE - TINYWIDTH * 2));
+ test(a + ax, b + bx, w);
+ total += w;
+ }
+ return total;
+}
+
+int main(int argc, char *argv[])
+{
+ static __attribute__((aligned(32))) char l1bufa[L1CACHESIZE/2-KILOBYTE];
+ static __attribute__((aligned(32))) char l1bufb[L1CACHESIZE/2-KILOBYTE];
+ static __attribute__((aligned(32))) char l2bufa[L2CACHESIZE/2-KILOBYTE];
+ static __attribute__((aligned(32))) char l2bufb[L2CACHESIZE/2-KILOBYTE];
+ static __attribute__((aligned(32))) char membufa[MEGABYTE];
+ static __attribute__((aligned(32))) char membufb[MEGABYTE];
+ size_t s, d, n;
+ uint64_t t1, t2, t3;
+ uint32_t byte_cnt;
+ size_t iterations;
+
+ srand(0);
+
+ if (argc != 2)
+ {
+ fprintf(stderr, "Syntax: %s <iterations>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ iterations = atoi(argv[1]);
+
+ memset(l1bufa, 0x5A, sizeof l1bufa);
+ memset(l1bufb, 0x5A, sizeof l1bufb);
+ memset(l2bufa, 0x5A, sizeof l2bufa);
+ memset(l2bufb, 0x5A, sizeof l2bufb);
+ memset(membufa, 0x5A, sizeof membufa);
+ memset(membufb, 0x5A, sizeof membufb);
+
+ // This code was useful for correctness checking.
+ // The "my" prefix was used during development to enable the test harness to function
+ // even when the local implementations were buggy.
+#if 0
+ void *mymemset(void *s, int c, size_t n);
+ void *mymemcpy(void * restrict s1, const void * restrict s2, size_t n);
+ void *mymemmove(void *s1, const void *s2, size_t n);
+ int mymemcmp(const void *s1, const void *s2, size_t n);
+
+// These defines are used to prove that the test harness is correct - to test the local
+// implementations, comment out the #defines below
+#define mymemset memset
+#define mymemcmp memcmp
+#define mymemcpy memcpy
+ /* Check mymemset */
+ for (d = 0; d < 64; d++)
+ {
+ for (n = 0; n < 192; n++)
+ {
+ memset(l1bufa+d, 0xA5, n);
+ mymemset(l1bufa+d, 0x5A, n);
+ if (memcmp(l1bufa, l1bufb, sizeof l1bufa) != 0)
+ {
+ printf("memset failed (insufficient) with d = %d, n = %d\n", d, n);
+ for (int x = 0; x < sizeof l1bufa; x++)
+ if (l1bufa[x] != 0x5A)
+ printf("Offset %d is wrong\n", x);
+ }
+ mymemset(l1bufa+d, 0xA5, n);
+ memset(l1bufa+d, 0x5A, n);
+ if (memcmp(l1bufa, l1bufb, sizeof l1bufa) != 0)
+ {
+ printf("memset failed (excessive) with d = %d, n = %d\n", d, n);
+ for (int x = 0; x < sizeof l1bufa; x++)
+ if (l1bufa[x] != 0x5A)
+ printf("Offset %d is wrong\n", x);
+ }
+ }
+ }
+
+ /* Check memcmp */
+ {
+#define SIGNOF(x) (((x)>0)-((x)<0))
+ uint32_t a = 0x00010200, b = 0x00020100;
+ int d1,d2;
+ if ((d1=SIGNOF(memcmp(l1bufa, l1bufb, sizeof l1bufa))) != (d2=SIGNOF(mymemcmp(l1bufa, l1bufb, sizeof l1bufa))))
+ printf("memcmp failed (0: %d %d)\n", d1, d2);
+ if ((d1=SIGNOF(memcmp(&a, &b, 4))) != (d2=SIGNOF(mymemcmp(&a, &b, 4))))
+ printf("memcmp failed (1: %d %d)\n", d1, d2);
+ if ((d1=SIGNOF(memcmp(&b, &a, 4))) != (d2=SIGNOF(mymemcmp(&b, &a, 4))))
+ printf("memcmp failed (2: %d %d)\n", d1, d2);
+
+ /*
+ for (size_t i = 32-(((int) l1bufa)&31); i < 32-(((int) l1bufa)&31) + 32; i++)
+ {
+ for (size_t len = 0; len < 256; len++)
+ {
+ mymemcpy(l1bufb+0, l1bufa+i, len);
+ }
+ for (size_t len = 0; len < 256; len++)
+ {
+ mymemcpy(l1bufb+1, l1bufa+i, len);
+ }
+ for (size_t len = 0; len < 256; len++)
+ {
+ mymemcpy(l1bufb+2, l1bufa+i, len);
+ }
+ for (size_t len = 0; len < 256; len++)
+ {
+ mymemcpy(l1bufb+30, l1bufa+i, len);
+ }
+ for (size_t len = 0; len < 256; len++)
+ {
+ mymemcpy(l1bufb+31, l1bufa+i, len);
+ }
+ }
+ */
+
+ memset(l2bufa, 0, sizeof l1bufa);
+ for (size_t i = 0; i < sizeof l1bufa; i += 4)
+ *(uint32_t*)(l1bufa+i) = rand();
+ for (size_t i = 0; i < 64; i++)
+ {
+ printf("%u\n", i);
+ for (size_t j = 0; j < 64; j++)
+ for (size_t len = 0; len < 2048; len++)
+ {
+ int myresult;
+ int trueresult;
+ memset(l1bufb, 0, sizeof l1bufb);
+ mymemcpy(l1bufb+j, l1bufa+i, len);
+ if (memcmp(l1bufb+j, l1bufa+i, len) != 0)
+ {
+ printf("memcpy failed (data: %u %u %u)\n", i, j, len);
+ printf("should be");
+ for (size_t x = 0; x < len; x++)
+ printf(" %02X%s", l1bufa[i+x] & 0xFF, l1bufa[i+x] != l1bufb[j+x] ? "*" : "");
+ printf("\nbut is ");
+ for (size_t x = 0; x < len; x++)
+ printf(" %02X%s", l1bufb[j+x] & 0xFF, l1bufa[i+x] != l1bufb[j+x] ? "*" : "");
+ printf("\n");
+ }
+ else if ((myresult = mymemcmp(l1bufb+j, l1bufa+i, len)) != 0)
+ {
+ printf("memcmp failed (%u %u %u) was %08x (%c0), should be =0\n", i, j, len, myresult, "<=>"[SIGNOF(myresult) + 1]);
+ myresult = mymemcmp(l1bufb+j, l1bufa+i, len);
+ }
+ for (size_t k = 0; k + 1 < len && k + 1 < 20; k++)
+ {
+ size_t k2 = len - 2 - k;
+ l1bufb[j+k] ^= 0x80;
+ l1bufb[j+k+1] ^= 0x80;
+
+ myresult = mymemcmp(l1bufb+j, l1bufa+i, len);
+ trueresult = memcmp(l1bufb+j, l1bufa+i, len);
+ if (SIGNOF(myresult) != SIGNOF(trueresult))
+ {
+ printf("memcmp failed (%u %u %u with diff at %u was %08x (%c0), should be %c0\n",
+ i, j, len, k,
+ myresult,
+ "<=>"[SIGNOF(myresult) + 1],
+ "<=>"[SIGNOF(trueresult) + 1]);
+ myresult = mymemcmp(l1bufb+j, l1bufa+i, len);
+ }
+ l1bufb[j+k] ^= 0x80;
+ l1bufb[j+k+1] ^= 0x80;
+ l1bufb[j+k2] ^= 0x80;
+ l1bufb[j+k2+1] ^= 0x80;
+ myresult = mymemcmp(l1bufb+j, l1bufa+i, len);
+ trueresult = memcmp(l1bufb+j, l1bufa+i, len);
+ if (SIGNOF(myresult) != SIGNOF(trueresult))
+ {
+ printf("memcmp failed (%u %u %u with diff at %u was %08x (%c0), should be %c0\n",
+ i, j, len, k2,
+ myresult,
+ "<=>"[SIGNOF(myresult) + 1],
+ "<=>"[SIGNOF(trueresult) + 1]);
+ myresult = mymemcmp(l1bufb+j, l1bufa+i, len);
+ }
+ l1bufb[j+k2] ^= 0x80;
+ l1bufb[j+k2+1] ^= 0x80;
+ }
+ if (memcmp(l1bufb, l2bufa, j) != 0)
+ printf("memcpy failed (before: %u %u %u)\n", i, j, len);
+ if (memcmp(l1bufb+j+len, l2bufa, sizeof l1bufa -j-len) != 0)
+ printf("memcpy failed (after: %u %u %u)\n", i, j, len);
+ }
+ }
+ }
+#endif
+
+ // This code is for benchmarking
+#if 1
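+ /* Each figure is net bytes per microsecond: candidate time minus the control-loop
+ * overhead. Columns: L1/L2 = cache-resident, M = memory-bound, T = sequential tiles,
+ * R = random tiles, RT = random tiny transfers. One row is printed per iteration. */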
+ printf("L1, L2, M, T, R, RT\n");
+
+ while (iterations--)
+ {
+ memcpy(l1bufa, l1bufb, sizeof l1bufa);
+ memcpy(l1bufb, l1bufa, sizeof l1bufa);
+
+ t1 = gettime();
+ bench_L(control, l1bufa, l1bufb, sizeof l1bufa - 64, TESTSIZE / (sizeof l1bufa - 64));
+ t2 = gettime();
+ byte_cnt = bench_L(CANDIDATE, l1bufa, l1bufb, sizeof l1bufa - 64, TESTSIZE / (sizeof l1bufa - 64));
+ t3 = gettime();
+ printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+
+ memcpy(l2bufa, l2bufb, sizeof l2bufa);
+ memcpy(l2bufb, l2bufa, sizeof l2bufa);
+
+ t1 = gettime();
+ bench_L(control, l2bufa, l2bufb, sizeof l2bufa - 64, TESTSIZE / (sizeof l2bufa - 64));
+ t2 = gettime();
+ byte_cnt = bench_L(CANDIDATE, l2bufa, l2bufb, sizeof l2bufa - 64, TESTSIZE / (sizeof l2bufa - 64));
+ t3 = gettime();
+ printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+
+ memcpy(membufa, membufb, sizeof membufa);
+ memcpy(membufb, membufa, sizeof membufa);
+
+ t1 = gettime();
+ bench_M(control, membufa, membufb, sizeof membufa - 64, TESTSIZE / (sizeof membufa - 64));
+ t2 = gettime();
+ byte_cnt = bench_M(CANDIDATE, membufa, membufb, sizeof membufa - 64, TESTSIZE / (sizeof membufa - 64));
+ t3 = gettime();
+ printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+
+ memcpy(membufa, membufb, sizeof membufa);
+ memcpy(membufb, membufa, sizeof membufa);
+
+ t1 = gettime();
+ bench_T(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t2 = gettime();
+ byte_cnt = bench_T(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t3 = gettime();
+ printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+
+ memcpy(membufa, membufb, sizeof membufa);
+ memcpy(membufb, membufa, sizeof membufa);
+
+ t1 = gettime();
+ bench_R(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t2 = gettime();
+ byte_cnt = bench_R(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t3 = gettime();
+ printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+
+ memcpy(membufa, membufb, sizeof membufa);
+ memcpy(membufb, membufa, sizeof membufa);
+
+ t1 = gettime();
+ bench_RT(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t2 = gettime();
+ byte_cnt = bench_RT(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2));
+ t3 = gettime();
+ printf("%6.2f\n", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ fflush(stdout);
+ }
+#elif 0
+ const char *sep = "";
+ for (int w = 1; w <= 100; w++)
+ {
+ printf("%sW%d", sep, w);
+ sep = ", ";
+ }
+ printf("\n");
+
+ while (iterations--)
+ {
+ sep = "";
+ for (int w = 1; w <= 100; w++)
+ {
+ memcpy(membufa, membufb, sizeof membufa);
+ memcpy(membufb, membufa, sizeof membufa);
+
+ t1 = gettime();
+ bench_RW(control, membufa, membufb, w, TESTSIZE / w);
+ t2 = gettime();
+ byte_cnt = bench_RW(CANDIDATE, membufa, membufb, w, TESTSIZE / w);
+ t3 = gettime();
+ printf("%s%6.2f", sep, ((double)byte_cnt) / ((t3 - t2) - (t2 - t1)));
+ sep = ", ";
+ fflush(stdout);
+ }
+ printf("\n");
+ }
+#endif
+}