From 246f46500d4654390333dcf92ac94fdbda2c9257 Mon Sep 17 00:00:00 2001
From: notaz
Date: Wed, 23 Oct 2024 00:16:39 +0300
Subject: [PATCH] git subrepo clone https://github.com/bavison/arm-mem deps/arm-mem

subrepo:
  subdir:   "deps/arm-mem"
  merged:   "ee8ac1d5"
upstream:
  origin:   "https://github.com/bavison/arm-mem"
  branch:   "master"
  commit:   "ee8ac1d5"
git-subrepo:
  version:  "0.4.9"
  origin:   "https://github.com/ingydotnet/git-subrepo.git"
  commit:   "57de7d6"
---
 deps/arm-mem/.gitrepo         |  12 +
 deps/arm-mem/Makefile         |  32 ++
 deps/arm-mem/README.md        |  12 +
 deps/arm-mem/arm-mem.h        | 160 +++++++++
 deps/arm-mem/memcmp-v6l.S     | 295 +++++++++++++++
 deps/arm-mem/memcmp-v7l.S     | 460 ++++++++++++++++++++++++
 deps/arm-mem/memcpymove-v6l.S | 578 +++++++++++++++++++++++++++++
 deps/arm-mem/memcpymove-v7l.S | 659 ++++++++++++++++++++++++++++++++++
 deps/arm-mem/memset-v6l.S     | 122 +++++++
 deps/arm-mem/memset-v7l.S     | 120 +++++++
 deps/arm-mem/strlen-v7l.S     | 157 ++++++++
 deps/arm-mem/test-strlen.c    |  45 +++
 deps/arm-mem/test.c           | 421 ++++++++++++++++++++++
 13 files changed, 3073 insertions(+)
 create mode 100644 deps/arm-mem/.gitrepo
 create mode 100644 deps/arm-mem/Makefile
 create mode 100644 deps/arm-mem/README.md
 create mode 100644 deps/arm-mem/arm-mem.h
 create mode 100644 deps/arm-mem/memcmp-v6l.S
 create mode 100644 deps/arm-mem/memcmp-v7l.S
 create mode 100644 deps/arm-mem/memcpymove-v6l.S
 create mode 100644 deps/arm-mem/memcpymove-v7l.S
 create mode 100644 deps/arm-mem/memset-v6l.S
 create mode 100644 deps/arm-mem/memset-v7l.S
 create mode 100644 deps/arm-mem/strlen-v7l.S
 create mode 100644 deps/arm-mem/test-strlen.c
 create mode 100644 deps/arm-mem/test.c

diff --git a/deps/arm-mem/.gitrepo b/deps/arm-mem/.gitrepo
new file mode 100644
index 00000000..0b2a61da
--- /dev/null
+++ b/deps/arm-mem/.gitrepo
@@ -0,0 +1,12 @@
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command.
See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/bavison/arm-mem + branch = master + commit = ee8ac1d56adb7ceef4d39a5cc21a502e41982685 + parent = 6fb01036deffd69da0af72ad1f5cf2b5fedd04d2 + method = merge + cmdver = 0.4.9 diff --git a/deps/arm-mem/Makefile b/deps/arm-mem/Makefile new file mode 100644 index 00000000..bf03f0f6 --- /dev/null +++ b/deps/arm-mem/Makefile @@ -0,0 +1,32 @@ +OBJS-V6L = memcmp-v6l.o memcpymove-v6l.o memset-v6l.o +OBJS-V7L = memcmp-v7l.o memcpymove-v7l.o memset-v7l.o strlen-v7l.o +CFLAGS += -std=gnu99 -O2 -fno-inline + +all: libarmmem-v6l.so libarmmem-v6l.a libarmmem-v7l.so libarmmem-v7l.a test test-strlen + +%.o: %.c + $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $^ + +%.o: %.S + $(CROSS_COMPILE)gcc -c -o $@ $^ + +libarmmem-v6l.so: $(OBJS-V6L) + $(CROSS_COMPILE)gcc -shared -o $@ -Wl,-soname,$@ $^ + +libarmmem-v6l.a: $(OBJS-V6L) + $(CROSS_COMPILE)ar rcs $@ $^ + +libarmmem-v7l.so: $(OBJS-V7L) + $(CROSS_COMPILE)gcc -shared -o $@ -Wl,-soname,$@ $^ + +libarmmem-v7l.a: $(OBJS-V7L) + $(CROSS_COMPILE)ar rcs $@ $^ + +test: test.o + $(CROSS_COMPILE)gcc -o $@ $^ + +test-strlen: test-strlen.o + $(CROSS_COMPILE)gcc -o $@ $^ + +clean: + rm -rf *.o *.so *.a test diff --git a/deps/arm-mem/README.md b/deps/arm-mem/README.md new file mode 100644 index 00000000..a270674b --- /dev/null +++ b/deps/arm-mem/README.md @@ -0,0 +1,12 @@ +arm-mem +======= + +ARM-accelerated versions of selected functions from string.h + +To build the library, use +$ make +or, if cross-compiling, use +$ CROSS_COMPILE=arm-linux-gnueabihf- make + +Also included is a simple test harness, inspired by the benchmarker +from the pixman library. This can be built via the "test" make target. diff --git a/deps/arm-mem/arm-mem.h b/deps/arm-mem/arm-mem.h new file mode 100644 index 00000000..e6ddb8ef --- /dev/null +++ b/deps/arm-mem/arm-mem.h @@ -0,0 +1,160 @@ +/* +Copyright (c) 2013, Raspberry Pi Foundation +Copyright (c) 2013, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +.macro myfunc fname + .func fname + .global fname + .type fname STT_FUNC +fname: +.endm + +.macro preload_leading_step1 backwards, ptr, base, log2cl +/* If the destination is already write-block aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ + .if backwards + sub ptr, base, #1 + bic ptr, ptr, #(1<0/=0/<0 if s1 >/=/< s2 + */ + +.set prefetch_distance, 2 + +myfunc memcmp + S_1 .req a1 + S_2 .req a2 + N .req a3 + DAT0 .req a4 + DAT1 .req v1 + DAT2 .req v2 + DAT3 .req v3 + DAT4 .req v4 + DAT5 .req v5 + DAT6 .req v6 + DAT7 .req ip + OFF .req lr + + push {DAT1-DAT6, lr} + + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */ + cmp N, #(prefetch_distance+3)*32 - 1 + blo 170f + + /* Long case */ + /* Adjust N so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub N, N, #(prefetch_distance+2)*32 + preload_leading_step1 0, DAT0, S_1, 5 + preload_leading_step1 0, DAT1, S_2, 5 + tst S_2, #31 + beq 154f + rsb OFF, S_2, #0 /* no need to AND with 15 here */ + preload_leading_step2 0, DAT0, S_1, 5, OFF, DAT2 + preload_leading_step2 0, DAT1, S_2, 5, OFF, DAT2 + memcmp_leading_31bytes +154: /* Second source now cacheline (32-byte) aligned; we have at + * least one prefetch to go. */ + /* Prefetch offset is best selected such that it lies in the + * first 8 of each 32 bytes - but it's just as easy to aim for + * the first one */ + and OFF, S_1, #31 + rsb OFF, OFF, #32*prefetch_distance + tst S_1, #3 + bne 140f + memcmp_long_inner_loop 0 +140: memcmp_long_inner_loop 1 + +170: /* Short case */ + teq N, #0 + beq 199f + preload_all 0, 0, 0, S_1, 5, N, DAT0, DAT1 + preload_all 0, 0, 0, S_2, 5, N, DAT0, DAT1 + tst S_2, #3 + beq 174f +172: subs N, N, #1 + blo 199f + ldrb DAT0, [S_1], #1 + ldrb DAT4, [S_2], #1 + cmp DAT0, DAT4 + bne 200f + tst S_2, #3 + bne 172b +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */ + tst S_1, #3 + bne 140f + memcmp_short_inner_loop 0 +140: memcmp_short_inner_loop 1 + +200: /* Difference found: determine sign. */ + rev DAT0, DAT0 + rev DAT4, DAT4 + rev DAT1, DAT1 + rev DAT5, DAT5 + rev DAT2, DAT2 + rev DAT6, DAT6 + rev DAT3, DAT3 + rev DAT7, DAT7 + + cmp DAT0, DAT4 + cmpeq DAT1, DAT5 + cmpeq DAT2, DAT6 + cmpeq DAT3, DAT7 + + movhi a1, #1 + movlo a1, #-1 + pop {DAT1-DAT6, pc} + .size memcmp,.-memcmp + + .unreq S_1 + .unreq S_2 + .unreq N + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq DAT4 + .unreq DAT5 + .unreq DAT6 + .unreq DAT7 + .unreq OFF +.endfunc diff --git a/deps/arm-mem/memcmp-v7l.S b/deps/arm-mem/memcmp-v7l.S new file mode 100644 index 00000000..36e3fed5 --- /dev/null +++ b/deps/arm-mem/memcmp-v7l.S @@ -0,0 +1,460 @@ +/* +Copyright (c) 2019, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .arm + .altmacro + .p2align 2 + + .altmacro + +/* Load 32 bytes from both buffers (8-byte aligned) post-incrementing the pointers + * r0q-r1q are unused, but retained so we have identical parameters to load_32b_x2_unaligned + * r0d-r3d are filled with data from S_1 + * r4d-r7d are filled with data from S_2 + * switch_loads indicates that we should re-order the loads to assist with scheduling a following pld + * I1-I8 are optional instructions to insert into stalls + */ +.macro load_32b_x2_aligned r0q, r1q, r0d, r1d, r2d, r3d, r4d, r5d, r6d, r7d, switch_loads, I1, I2, I3, I4, I5, I6, I7, I8 + .if switch_loads == 1 + vld1.32 {\r4d}, [S_2 :64]! + \I1 + vld1.32 {\r0d}, [S_1 :64]! + \I2 + vld1.32 {\r5d}, [S_2 :64]! + \I3 + vld1.32 {\r1d}, [S_1 :64]! + \I4 + vld1.32 {\r6d}, [S_2 :64]! + \I5 + vld1.32 {\r2d}, [S_1 :64]! + \I6 + vld1.32 {\r7d}, [S_2 :64]! + \I7 + vld1.32 {\r3d}, [S_1 :64]! + \I8 + .else + vld1.32 {\r0d}, [S_1 :64]! + \I1 + vld1.32 {\r4d}, [S_2 :64]! + \I2 + vld1.32 {\r1d}, [S_1 :64]! + \I3 + vld1.32 {\r5d}, [S_2 :64]! + \I4 + vld1.32 {\r2d}, [S_1 :64]! + \I5 + vld1.32 {\r6d}, [S_2 :64]! + \I6 + vld1.32 {\r3d}, [S_1 :64]! + \I7 + vld1.32 {\r7d}, [S_2 :64]! 
+ \I8 + .endif +.endm + +/* Load 32 bytes from both buffers (S_1 rounded up to 8-byte boundary, S_2 8-byte aligned), post-incrementing the pointers + * S_1A, S_2A are 8 bytes on from S_1, S_2 + * SIXTEEN is constant #16 + * r0q-r1q are Q-reg names for r0d-r3d + * r0d-r3d are filled with data from S_1 + * r4d-r7d are filled with data from S_2 + * switch_loads is ignored in this case + * I1-I8 are optional instructions to insert into stalls + * d2-d6 are used as temporaries + * d7 on entry and exit holds the content of aligned 8-byte block containing "true" value of S_1 + * d8.u8[0] = - ((("true" S_1) & 7) * 8) + * d9.u8[0] = 64 + d8.u8[0] + */ +.macro load_32b_x2_unaligned r0q, r1q, r0d, r1d, r2d, r3d, r4d, r5d, r6d, r7d, switch_loads, I1, I2, I3, I4, I5, I6, I7, I8 + vld1.32 {d4}, [S_1 :64], SIXTEEN + \I1 + vld1.32 {d5}, [S_1A :64], SIXTEEN + vshl.u64 \r0d, d7, d8 + vld1.32 {d6}, [S_1 :64], SIXTEEN + \I2 + vld1.32 {d7}, [S_1A :64], SIXTEEN + vshl.u64 d2, d4, d9 + vld1.32 {\r4d}, [S_2 :64], SIXTEEN + vshl.u64 \r1d, d4, d8 + vld1.32 {\r5d}, [S_2A :64], SIXTEEN + vshl.u64 d3, d5, d9 + vld1.32 {\r6d}, [S_2 :64], SIXTEEN + vshl.u64 \r2d, d5, d8 + vld1.32 {\r7d}, [S_2A :64], SIXTEEN + vshl.u64 d4, d6, d9 + vshl.u64 \r3d, d6, d8 + vshl.u64 d5, d7, d9 + vorr \r0q, q1 + \I8 + \I3 + \I4 + \I5 + \I6 + \I7 + vorr \r1q, q2 +.endm + +.macro process_32b_blocks load_macro + // Process these as an odd number of 32-byte full blocks, + // then a partial block of up to 63 trailing bytes + cmp N, #32 + sub N, #64 + bmi 20f + \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 0 + veor.u8 q0, q8, q10 + subs N, #32 + veor.u8 q1, q9, q11 + bmi 9f +1: \load_macro q12, q13, d24, d25, d26, d27, d28, d29, d30, d31, 0, \ + , \ + , \ + , \ + , \ + , \ + , \ + , \ + + orrs RES, TMP1, TMP2 + veor.u8 q1, q13, q15 + bne 33f + \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 1, \ + , \ + , \ + , \ + , \ + , \ + , \ + , \ + + orrs RES, TMP1, TMP2 + veor.u8 q1, q9, q11 + bne 31f + subs N, #64 + bpl 1b +9: vorr q0, q1 + vorr d0, d1 + vmov TMP1, s0 + vmov TMP2, s1 + orrs RES, TMP1, TMP2 + bne 33f +10: tst N, #32 + beq 14f + \load_macro q8, q9, d16, d17, d18, d19, d20, d21, d22, d23, 0 + veor.u8 q0, q8, q10 + veor.u8 q1, q9, q11 + vorr q0, q1 + vorr d0, d1 + vmov TMP1, s0 + vmov TMP2, s1 + orrs RES, TMP1, TMP2 + bne 33f +14: +.endm + +/* + * int memcmp(const void *s1, const void *s2, size_t n); + * On entry: + * a1 = pointer to buffer 1 + * a2 = pointer to buffer 2 + * a3 = number of bytes to compare (as unsigned chars) + * On exit: + * a1 = >0/=0/<0 if s1 >/=/< s2 + */ + +.set prefetch_distance, 63 + +myfunc memcmp + RES .req a1 + S_2 .req a2 + N .req a3 + S_1 .req a4 + S_1A .req v1 + S_2A .req v2 + SIXTEEN .req v3 + TMP1 .req ip + TMP2 .req lr + + // Based on real-world data, we are actually very likely to find a + // difference within the first few bytes, so it's unlikely to be + // beneficial to vectorise these. Test first 1+ bytes individually, + // stopping when we have at least the s2 pointer 8-byte aligned. 
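+ // a1 doubles as the return value (RES), so take a copy of the s1 pointer
+ // into a4 (S_1) first; the byte loop below then walks both buffers until
+ // s2 reaches an 8-byte boundary, N is exhausted, or a mismatch is found.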
+ mov S_1, a1 + and RES, S_2, #7 + push {lr} + rsb RES, #7 + subs N, #1 + ldrcsb TMP2, [S_2], #1 + ldrcsb TMP1, [S_1], #1 + bcc 43f + cmp RES, N + movcs RES, N + teq RES, #0 + beq 9f + sub N, RES +1: cmp TMP1, TMP2 + ldrb TMP1, [S_1], #1 + bne 41f + ldrb TMP2, [S_2], #1 + subs RES, #1 + bne 1b +9: cmp TMP1, TMP2 + bne 41f + teq N, #0 + beq 43f // because it's very common to have found a match by now + + tst S_1, #7 + bne 50f + + // Both aligned + process_32b_blocks load_32b_x2_aligned + lsls N, #32-5 + beq 43f + bpl 15f + vld1.32 {d16}, [S_1 :64]! + vld1.32 {d20}, [S_2 :64]! + vld1.32 {d17}, [S_1 :64]! + vld1.32 {d21}, [S_2 :64]! +15: lsls N, #2 + bcc 16f + vld1.32 {d18}, [S_1 :64]! + vld1.32 {d22}, [S_2 :64]! +16: bpl 17f + vld1.32 {d19[0]}, [S_1 :32]! + vld1.32 {d23[0]}, [S_2 :32]! +17: lsls N, #2 + bcc 18f + vld1.16 {d19[2]}, [S_1 :16]! + vld1.16 {d23[2]}, [S_2 :16]! +18: bpl 19f + vld1.8 {d19[6]}, [S_1]! + vld1.8 {d23[6]}, [S_2]! +19: veor.u8 q0, q8, q10 + veor.u8 q1, q9, q11 + vorr q0, q1 + vorr d0, d1 + vmov TMP1, s0 + vmov TMP2, s1 + orrs RES, TMP1, TMP2 + bne 33f + pop {pc} + +20: // Make both banks match so the holes between loads won't affect result + vmov q8, q10 + vmov q9, q11 + b 10b + +31: // Diff found in q12-q15 + push {v1,v2} + vrev32.8 q0, q12 + vrev32.8 q1, q14 + vmov a1, a2, d0 + vmov a3, a4, d2 + vmov v1, v2, d1 + vmov ip, lr, d3 + cmp a3, a1 + vrev32.8 q0, q13 + cmpeq a4, a2 + vrev32.8 q1, q15 + cmpeq ip, v1 + vmov a1, a2, d0 + cmpeq lr, v2 + vmov a3, a4, d2 + movne RES, #1 + vmov v1, v2, d1 + bne 32f + vmov ip, lr, d3 + cmp a3, a1 + cmpeq a4, a2 + mov RES, #1 + cmpeq ip, v1 + cmpeq lr, v2 +32: subcs RES, #2 + pop {v1,v2,pc} + +33: // Diff found in q8-q11 + push {v1,v2} + vrev32.8 q0, q8 + vrev32.8 q1, q10 + vmov a1, a2, d0 + vmov a3, a4, d2 + vmov v1, v2, d1 + vmov ip, lr, d3 + cmp a3, a1 + vrev32.8 q0, q9 + cmpeq a4, a2 + vrev32.8 q1, q11 + cmpeq ip, v1 + vmov a1, a2, d0 + cmpeq lr, v2 + vmov a3, a4, d2 + movne RES, #1 + vmov v1, v2, d1 + bne 34f + vmov ip, lr, d3 + cmp a3, a1 + cmpeq a4, a2 + mov RES, #1 + cmpeq ip, v1 + cmpeq lr, v2 +34: subcs RES, #2 + pop {v1,v2,pc} + +41: movcc RES, #-1 + movcs RES, #1 + pop {pc} + +43: mov RES, #0 + pop {pc} + + +50: // Only S_2 is aligned + push {v1-v3} + and v3, S_1, #7 + bic S_1, #7 + add S_1A, S_1, #16 + add S_2A, S_2, #8 + vpush {q4} + lsl v3, #3 + rsb v3, #0 + vld1.32 {d7}, [S_1 :64]! + vmov s16, v3 + add v3, #64 + vmov s18, v3 + mov SIXTEEN, #16 + process_32b_blocks load_32b_x2_unaligned + lsls N, #32-5 + beq 43f + // Reapply the offset to S_1 and use unaligned loads from here on + vmov TMP1, s16 + sub S_1, #8 + sub S_1, TMP1, asr #3 + bpl 15f + vld1.32 {d16}, [S_1]! + vld1.32 {d20}, [S_2 :64]! + vld1.32 {d17}, [S_1]! + vld1.32 {d21}, [S_2 :64]! +15: lsls N, #2 + bcc 16f + vld1.32 {d18}, [S_1]! + vld1.32 {d22}, [S_2 :64]! +16: bpl 17f + vld1.32 {d19[0]}, [S_1]! + vld1.32 {d23[0]}, [S_2 :32]! +17: lsls N, #2 + bcc 18f + vld1.16 {d19[2]}, [S_1]! + vld1.16 {d23[2]}, [S_2 :16]! +18: bpl 19f + vld1.8 {d19[6]}, [S_1]! + vld1.8 {d23[6]}, [S_2]! 
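+ // All trailing bytes are now loaded: q8-q9 hold data from s1 and q10-q11
+ // the corresponding data from s2. XOR the two banks and OR the lanes down
+ // to two words so that any non-zero bit indicates a difference.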
+19: veor.u8 q0, q8, q10 + veor.u8 q1, q9, q11 + vorr q0, q1 + vorr d0, d1 + vmov TMP1, s0 + vmov TMP2, s1 + orrs RES, TMP1, TMP2 + bne 33f + vpop {q4} + pop {v1-v3,pc} + +20: // Make both banks match so the holes between loads won't affect result + vmov q8, q10 + vmov q9, q11 + b 10b + +31: // Diff found in q12-q15 + vrev32.8 q0, q12 + vrev32.8 q1, q14 + vmov a1, a2, d0 + vmov a3, a4, d2 + vmov v1, v2, d1 + vmov ip, lr, d3 + cmp a3, a1 + vrev32.8 q0, q13 + cmpeq a4, a2 + vrev32.8 q1, q15 + cmpeq ip, v1 + vmov a1, a2, d0 + cmpeq lr, v2 + vmov a3, a4, d2 + movne RES, #1 + vmov v1, v2, d1 + bne 32f + vmov ip, lr, d3 + cmp a3, a1 + cmpeq a4, a2 + mov RES, #1 + cmpeq ip, v1 + cmpeq lr, v2 +32: vpop {q4} + subcs RES, #2 + pop {v1-v3,pc} + +33: // Diff found in q8-q11 + vrev32.8 q0, q8 + vrev32.8 q1, q10 + vmov a1, a2, d0 + vmov a3, a4, d2 + vmov v1, v2, d1 + vmov ip, lr, d3 + cmp a3, a1 + vrev32.8 q0, q9 + cmpeq a4, a2 + vrev32.8 q1, q11 + cmpeq ip, v1 + vmov a1, a2, d0 + cmpeq lr, v2 + vmov a3, a4, d2 + movne RES, #1 + vmov v1, v2, d1 + bne 34f + vmov ip, lr, d3 + cmp a3, a1 + cmpeq a4, a2 + mov RES, #1 + cmpeq ip, v1 + cmpeq lr, v2 +34: vpop {q4} + subcs RES, #2 + pop {v1-v3,pc} + +43: vpop {q4} + mov RES, #0 + pop {v1-v3,pc} + .size memcmp,.-memcmp diff --git a/deps/arm-mem/memcpymove-v6l.S b/deps/arm-mem/memcpymove-v6l.S new file mode 100644 index 00000000..8709c989 --- /dev/null +++ b/deps/arm-mem/memcpymove-v6l.S @@ -0,0 +1,578 @@ +/* +Copyright (c) 2013, Raspberry Pi Foundation +Copyright (c) 2013, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 + .if words == 1 + .if backwards + mov r1, r0, lsl #32-align*8 + ldr r0, [S, #-4]! + orr r1, r1, r0, lsr #align*8 + str r1, [D, #-4]! + .else + mov r0, r1, lsr #align*8 + ldr r1, [S, #4]! 
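+ /* Merge the leftover bytes of the previous source word (already shifted
+  * down into r0) with the newly loaded word, forming one destination-aligned
+  * word to store. */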
+ orr r0, r0, r1, lsl #32-align*8 + str r0, [D], #4 + .endif + .elseif words == 2 + .if backwards + ldr r1, [S, #-4]! + mov r2, r0, lsl #32-align*8 + ldr r0, [S, #-4]! + orr r2, r2, r1, lsr #align*8 + mov r1, r1, lsl #32-align*8 + orr r1, r1, r0, lsr #align*8 + stmdb D!, {r1, r2} + .else + ldr r1, [S, #4]! + mov r0, r2, lsr #align*8 + ldr r2, [S, #4]! + orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + stmia D!, {r0, r1} + .endif + .elseif words == 4 + .if backwards + ldmdb S!, {r2, r3} + mov r4, r0, lsl #32-align*8 + ldmdb S!, {r0, r1} + orr r4, r4, r3, lsr #align*8 + mov r3, r3, lsl #32-align*8 + orr r3, r3, r2, lsr #align*8 + mov r2, r2, lsl #32-align*8 + orr r2, r2, r1, lsr #align*8 + mov r1, r1, lsl #32-align*8 + orr r1, r1, r0, lsr #align*8 + stmdb D!, {r1, r2, r3, r4} + .else + ldmib S!, {r1, r2} + mov r0, r4, lsr #align*8 + ldmib S!, {r3, r4} + orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + mov r2, r2, lsr #align*8 + orr r2, r2, r3, lsl #32-align*8 + mov r3, r3, lsr #align*8 + orr r3, r3, r4, lsl #32-align*8 + stmia D!, {r0, r1, r2, r3} + .endif + .elseif words == 8 + .if backwards + ldmdb S!, {r4, r5, r6, r7} + mov r8, r0, lsl #32-align*8 + ldmdb S!, {r0, r1, r2, r3} + .if use_pld + pld [S, OFF] + .endif + orr r8, r8, r7, lsr #align*8 + mov r7, r7, lsl #32-align*8 + orr r7, r7, r6, lsr #align*8 + mov r6, r6, lsl #32-align*8 + orr r6, r6, r5, lsr #align*8 + mov r5, r5, lsl #32-align*8 + orr r5, r5, r4, lsr #align*8 + mov r4, r4, lsl #32-align*8 + orr r4, r4, r3, lsr #align*8 + mov r3, r3, lsl #32-align*8 + orr r3, r3, r2, lsr #align*8 + mov r2, r2, lsl #32-align*8 + orr r2, r2, r1, lsr #align*8 + mov r1, r1, lsl #32-align*8 + orr r1, r1, r0, lsr #align*8 + stmdb D!, {r5, r6, r7, r8} + stmdb D!, {r1, r2, r3, r4} + .else + ldmib S!, {r1, r2, r3, r4} + mov r0, r8, lsr #align*8 + ldmib S!, {r5, r6, r7, r8} + .if use_pld + pld [S, OFF] + .endif + orr r0, r0, r1, lsl #32-align*8 + mov r1, r1, lsr #align*8 + orr r1, r1, r2, lsl #32-align*8 + mov r2, r2, lsr #align*8 + orr r2, r2, r3, lsl #32-align*8 + mov r3, r3, lsr #align*8 + orr r3, r3, r4, lsl #32-align*8 + mov r4, r4, lsr #align*8 + orr r4, r4, r5, lsl #32-align*8 + mov r5, r5, lsr #align*8 + orr r5, r5, r6, lsl #32-align*8 + mov r6, r6, lsr #align*8 + orr r6, r6, r7, lsl #32-align*8 + mov r7, r7, lsr #align*8 + orr r7, r7, r8, lsl #32-align*8 + stmia D!, {r0, r1, r2, r3} + stmia D!, {r4, r5, r6, r7} + .endif + .endif +.endm + +.macro memcpy_leading_15bytes backwards, align + movs DAT1, DAT2, lsl #31 + sub N, N, DAT2 + .if backwards + ldrmib DAT0, [S, #-1]! + ldrcsh DAT1, [S, #-2]! + strmib DAT0, [D, #-1]! + strcsh DAT1, [D, #-2]! + .else + ldrmib DAT0, [S], #1 + ldrcsh DAT1, [S], #2 + strmib DAT0, [D], #1 + strcsh DAT1, [D], #2 + .endif + movs DAT1, DAT2, lsl #29 + .if backwards + ldrmi DAT0, [S, #-4]! + .if align == 0 + ldmcsdb S!, {DAT1, DAT2} + .else + ldrcs DAT2, [S, #-4]! + ldrcs DAT1, [S, #-4]! + .endif + strmi DAT0, [D, #-4]! + stmcsdb D!, {DAT1, DAT2} + .else + ldrmi DAT0, [S], #4 + .if align == 0 + ldmcsia S!, {DAT1, DAT2} + .else + ldrcs DAT1, [S], #4 + ldrcs DAT2, [S], #4 + .endif + strmi DAT0, [D], #4 + stmcsia D!, {DAT1, DAT2} + .endif +.endm + +.macro memcpy_trailing_15bytes backwards, align + movs N, N, lsl #29 + .if backwards + .if align == 0 + ldmcsdb S!, {DAT0, DAT1} + .else + ldrcs DAT1, [S, #-4]! + ldrcs DAT0, [S, #-4]! + .endif + ldrmi DAT2, [S, #-4]! + stmcsdb D!, {DAT0, DAT1} + strmi DAT2, [D, #-4]! 
+ .else + .if align == 0 + ldmcsia S!, {DAT0, DAT1} + .else + ldrcs DAT0, [S], #4 + ldrcs DAT1, [S], #4 + .endif + ldrmi DAT2, [S], #4 + stmcsia D!, {DAT0, DAT1} + strmi DAT2, [D], #4 + .endif + movs N, N, lsl #2 + .if backwards + ldrcsh DAT0, [S, #-2]! + ldrmib DAT1, [S, #-1] + strcsh DAT0, [D, #-2]! + strmib DAT1, [D, #-1] + .else + ldrcsh DAT0, [S], #2 + ldrmib DAT1, [S] + strcsh DAT0, [D], #2 + strmib DAT1, [D] + .endif +.endm + +.macro memcpy_long_inner_loop backwards, align + .if align != 0 + .if backwards + ldr DAT0, [S, #-align]! + .else + ldr LAST, [S, #-align]! + .endif + .endif +110: + .if align == 0 + .if backwards + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + pld [S, OFF] + stmdb D!, {DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT0, DAT1, DAT2, DAT3} + .else + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + pld [S, OFF] + stmia D!, {DAT0, DAT1, DAT2, DAT3} + stmia D!, {DAT4, DAT5, DAT6, LAST} + .endif + .else + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST + .endif + subs N, N, #32 + bhs 110b + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + preload_trailing backwards, S, 5, N, OFF + add N, N, #(prefetch_distance+2)*32 - 32 +120: + .if align == 0 + .if backwards + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT4, DAT5, DAT6, LAST} + stmdb D!, {DAT0, DAT1, DAT2, DAT3} + .else + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST} + stmia D!, {DAT0, DAT1, DAT2, DAT3} + stmia D!, {DAT4, DAT5, DAT6, LAST} + .endif + .else + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST + .endif + subs N, N, #32 + bhs 120b + tst N, #16 + .if align == 0 + .if backwards + ldmnedb S!, {DAT0, DAT1, DAT2, LAST} + stmnedb D!, {DAT0, DAT1, DAT2, LAST} + .else + ldmneia S!, {DAT0, DAT1, DAT2, LAST} + stmneia D!, {DAT0, DAT1, DAT2, LAST} + .endif + .else + beq 130f + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST +130: + .endif + /* Trailing words and bytes */ + tst N, #15 + beq 199f + .if align != 0 + add S, S, #align + .endif + memcpy_trailing_15bytes backwards, align +199: + pop {DAT3, DAT4, DAT5, DAT6, DAT7} + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy_medium_inner_loop backwards, align +120: + .if backwards + .if align == 0 + ldmdb S!, {DAT0, DAT1, DAT2, LAST} + .else + ldr LAST, [S, #-4]! + ldr DAT2, [S, #-4]! + ldr DAT1, [S, #-4]! + ldr DAT0, [S, #-4]! + .endif + stmdb D!, {DAT0, DAT1, DAT2, LAST} + .else + .if align == 0 + ldmia S!, {DAT0, DAT1, DAT2, LAST} + .else + ldr DAT0, [S], #4 + ldr DAT1, [S], #4 + ldr DAT2, [S], #4 + ldr LAST, [S], #4 + .endif + stmia D!, {DAT0, DAT1, DAT2, LAST} + .endif + subs N, N, #16 + bhs 120b + /* Trailing words and bytes */ + tst N, #15 + beq 199f + memcpy_trailing_15bytes backwards, align +199: + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy_short_inner_loop backwards, align + tst N, #16 + .if backwards + .if align == 0 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST} + .else + ldrne LAST, [S, #-4]! + ldrne DAT2, [S, #-4]! + ldrne DAT1, [S, #-4]! + ldrne DAT0, [S, #-4]! 
+ .endif + stmnedb D!, {DAT0, DAT1, DAT2, LAST} + .else + .if align == 0 + ldmneia S!, {DAT0, DAT1, DAT2, LAST} + .else + ldrne DAT0, [S], #4 + ldrne DAT1, [S], #4 + ldrne DAT2, [S], #4 + ldrne LAST, [S], #4 + .endif + stmneia D!, {DAT0, DAT1, DAT2, LAST} + .endif + memcpy_trailing_15bytes backwards, align +199: + pop {D, DAT1, DAT2, pc} +.endm + +.macro memcpy backwards + D .req a1 + S .req a2 + N .req a3 + DAT0 .req a4 + DAT1 .req v1 + DAT2 .req v2 + DAT3 .req v3 + DAT4 .req v4 + DAT5 .req v5 + DAT6 .req v6 + DAT7 .req sl + LAST .req ip + OFF .req lr + + .cfi_startproc + + push {D, DAT1, DAT2, lr} + + .cfi_def_cfa_offset 16 + .cfi_rel_offset D, 0 + .cfi_undefined S + .cfi_undefined N + .cfi_undefined DAT0 + .cfi_rel_offset DAT1, 4 + .cfi_rel_offset DAT2, 8 + .cfi_undefined LAST + .cfi_rel_offset lr, 12 + + .if backwards + add D, D, N + add S, S, N + .endif + + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */ + cmp N, #31 + blo 170f + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */ + cmp N, #(prefetch_distance+3)*32 - 1 + blo 160f + + /* Long case */ + push {DAT3, DAT4, DAT5, DAT6, DAT7} + + .cfi_def_cfa_offset 36 + .cfi_rel_offset D, 20 + .cfi_rel_offset DAT1, 24 + .cfi_rel_offset DAT2, 28 + .cfi_rel_offset DAT3, 0 + .cfi_rel_offset DAT4, 4 + .cfi_rel_offset DAT5, 8 + .cfi_rel_offset DAT6, 12 + .cfi_rel_offset DAT7, 16 + .cfi_rel_offset lr, 32 + + /* Adjust N so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub N, N, #(prefetch_distance+2)*32 + preload_leading_step1 backwards, DAT0, S, 5 + .if backwards + /* Bug in GAS: it accepts, but mis-assembles the instruction + * ands DAT2, D, #60, 2 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow) + */ + .word 0xE210513C + beq 154f + .else + ands DAT2, D, #15 + beq 154f + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */ + .endif + preload_leading_step2 backwards, DAT0, S, 5, DAT2, OFF + memcpy_leading_15bytes backwards, 1 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */ + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */ + .if backwards + rsb OFF, S, #3 + and OFF, OFF, #28 + sub OFF, OFF, #32*(prefetch_distance+1) + .else + and OFF, S, #28 + rsb OFF, OFF, #32*prefetch_distance + .endif + movs DAT0, S, lsl #31 + bhi 157f + bcs 156f + bmi 155f + memcpy_long_inner_loop backwards, 0 +155: memcpy_long_inner_loop backwards, 1 +156: memcpy_long_inner_loop backwards, 2 +157: memcpy_long_inner_loop backwards, 3 + + .cfi_def_cfa_offset 16 + .cfi_rel_offset D, 0 + .cfi_rel_offset DAT1, 4 + .cfi_rel_offset DAT2, 8 + .cfi_same_value DAT3 + .cfi_same_value DAT4 + .cfi_same_value DAT5 + .cfi_same_value DAT6 + .cfi_same_value DAT7 + .cfi_rel_offset lr, 12 + +160: /* Medium case */ + preload_all backwards, 0, 0, S, 5, N, DAT2, OFF + sub N, N, #16 /* simplifies inner loop termination */ + .if backwards + ands DAT2, D, #15 + beq 164f + .else + ands DAT2, D, #15 + beq 164f + rsb DAT2, DAT2, #16 + .endif + memcpy_leading_15bytes backwards, align +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ + tst S, #3 + bne 140f + memcpy_medium_inner_loop backwards, 0 +140: memcpy_medium_inner_loop backwards, 1 + 
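+ /* The medium-case loops above each return via their own pop {...,pc};
+  * the short case below is only reached by the "blo 170f" branch taken
+  * for copies of fewer than 31 bytes. */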
+170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + teq N, #0 + beq 199f + preload_all backwards, 1, 0, S, 5, N, DAT2, LAST + tst D, #3 + beq 174f +172: subs N, N, #1 + blo 199f + .if backwards + ldrb DAT0, [S, #-1]! + strb DAT0, [D, #-1]! + .else + ldrb DAT0, [S], #1 + strb DAT0, [D], #1 + .endif + tst D, #3 + bne 172b +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ + tst S, #3 + bne 140f + memcpy_short_inner_loop backwards, 0 +140: memcpy_short_inner_loop backwards, 1 + + .cfi_endproc + + .unreq D + .unreq S + .unreq N + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq DAT4 + .unreq DAT5 + .unreq DAT6 + .unreq DAT7 + .unreq LAST + .unreq OFF +.endm + +/* + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 preserved + */ + +.set prefetch_distance, 3 + +myfunc memcpy +1000: memcpy 0 + .size memcpy,.-memcpy +.endfunc + +/* + * void *memmove(void *s1, const void *s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 preserved + */ + +.set prefetch_distance, 3 + +myfunc memmove + cmp a2, a1 + bpl 1000b /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */ + memcpy 1 + .size memmove,.-memmove +.endfunc + +/* + * void *mempcpy(void * restrict s1, const void * restrict s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 = pointer to immediately after destination block + */ + +myfunc mempcpy +.global __mempcpy +.type __mempcpy STT_FUNC +__mempcpy: + push {v1, lr} + mov v1, a3 + bl 1000b + add a1, a1, v1 + pop {v1, pc} + .size mempcpy,.-mempcpy + .size __mempcpy,.-__mempcpy diff --git a/deps/arm-mem/memcpymove-v7l.S b/deps/arm-mem/memcpymove-v7l.S new file mode 100644 index 00000000..a7a8db9c --- /dev/null +++ b/deps/arm-mem/memcpymove-v7l.S @@ -0,0 +1,659 @@ +/* +Copyright (c) 2015, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +.macro memcpy_leading_63bytes backwards, align + movs TMP, LEAD, lsl #31 + bpl 1f + .if backwards + sub S, S, #1 + sub D, D, #1 + vld1.8 {d7[7]}, [S] + vst1.8 {d7[7]}, [D] + .else + vld1.8 {d7[7]}, [S]! + vst1.8 {d7[7]}, [D]! + .endif +1: bcc 1f + .if backwards + .if align == 0 || align == 2 + sub S, S, #2 + sub D, D, #2 + vld1.16 {d7[3]}, [S :16] + .else + sub S, S, #1 + sub D, D, #2 + vld1.8 {d7[7]}, [S] + sub S, S, #1 + vld1.8 {d7[6]}, [S] + .endif + vst1.16 {d7[3]}, [D :16] + .else + .if align == 0 || align == 2 + vld1.16 {d7[3]}, [S :16]! + .else + vld1.8 {d7[6]}, [S]! + vld1.8 {d7[7]}, [S]! + .endif + vst1.16 {d7[3]}, [D :16]! + .endif +1: + .if align == 0 + movs TMP, LEAD, lsl #29 + .if backwards + vldmdbmi S!, {s13} + vldmdbcs S!, {d7} + vstmdbmi D!, {s13} + vstmdbcs D!, {d7} + .else + vldmiami S!, {s13} + vldmiacs S!, {d7} + vstmiami D!, {s13} + vstmiacs D!, {d7} + .endif + movs TMP, LEAD, lsl #27 + .if backwards + vldmdbmi S!, {d2-d3} + vldmdbcs S!, {d4-d7} + vstmdbmi D!, {d2-d3} + vstmdbcs D!, {d4-d7} + .else + vldmiami S!, {d2-d3} + vldmiacs S!, {d4-d7} + vstmiami D!, {d2-d3} + vstmiacs D!, {d4-d7} + .endif + .else + .if backwards + add S, S, #4-align + vldmdb S!, {s0} + .else + sub S, S, #align + vldmia S!, {s19} + .endif + movs TMP, LEAD, lsl #29 + bpl 1f + .if backwards + vmov s1, s0 + vldmdb S!, {s0} + vext.8 d1, d0, d1, #align + vstmdb D!, {s2} + .else + vmov s18, s19 + vldmia S!, {s19} + vext.8 d8, d9, d10, #align + vstmia D!, {s16} + .endif +1: bcc 1f + .if backwards + vmov s2, s0 + vldmdb S!, {d0} + vext.8 d1, d0, d1, #align + vstmdb D!, {d1} + .else + vmov s17, s19 + vldmia S!, {d9} + vext.8 d8, d8, d9, #4+align + vstmia D!, {d8} + .endif +1: movs TMP, LEAD, lsl #27 + bpl 1f + .if backwards + vmov s4, s0 + vldmdb S!, {d0-d1} + vext.8 q1, q0, q1, #align + vstmdb D!, {d2-d3} + .else + vmov s15, s19 + vldmia S!, {d8-d9} + vext.8 q3, q3, q4, #12+align + vstmia D!, {d6-d7} + .endif +1: bcc 1f + .if backwards + vmov s8, s0 + vldmdb S!, {d0-d3} + vext.8 q2, q1, q2, #align + vext.8 q1, q0, q1, #align + vstmdb D!, {d2-d5} + .else + vmov s11, s19 + vldmia S!, {d6-d9} + vext.8 q2, q2, q3, #12+align + vext.8 q3, q3, q4, #12+align + vstmia D!, {d4-d7} + .endif +1: + .endif +.endm + +.macro memcpy_middle_64bytes backwards, align, use_pld, add_nops + .if align == 0 + .if backwards + vldmdb S!, {d0-d7} + .if use_pld + pld [S, OFF] + .endif + vstmdb D!, {d0-d7} + .else + vldmia S!, {d0-d7} + .if add_nops + .rept 14 + nop + .endr + .endif + .if use_pld + pld [S, OFF] + .endif + vstmia D!, {d0-d7} + .if add_nops + .rept 7 + nop + .endr + .endif + .endif + .else + .if backwards + vmov s16, s0 + vldmdb S!, {d0-d7} + .if use_pld + pld [S, OFF] + .endif + vext.8 q4, q3, q4, #align + vext.8 q3, q2, q3, #align + vext.8 q2, 
q1, q2, #align + vext.8 q1, q0, q1, #align + vstmdb D!, {d2-d9} + .else + vmov s3, s19 + vldmia S!, {d2-d9} + .if add_nops + .rept 7 + nop + .endr + .endif + .if use_pld + pld [S, OFF] + .endif + vext.8 q0, q0, q1, #12+align + vext.8 q1, q1, q2, #12+align + vext.8 q2, q2, q3, #12+align + vext.8 q3, q3, q4, #12+align + .if add_nops + nop + nop + nop + .endif + vstmia D!, {d0-d7} + .if add_nops + nop + nop + .endif + .endif + .endif +.endm + +.macro memcpy_trailing_63bytes backwards, align + movs TMP, N, lsl #27 + .if align == 0 + .if backwards + vldmdbcs S!, {d4-d7} + vldmdbmi S!, {d2-d3} + vstmdbcs D!, {d4-d7} + vstmdbmi D!, {d2-d3} + .else + vldmiacs S!, {d4-d7} + vldmiami S!, {d2-d3} + vstmiacs D!, {d4-d7} + vstmiami D!, {d2-d3} + .endif + movs TMP, N, lsl #29 + .if backwards + vldmdbcs S!, {d7} + vldmdbmi S!, {s13} + vstmdbcs D!, {d7} + vstmdbmi D!, {s13} + .else + vldmiacs S!, {d7} + vldmiami S!, {s13} + vstmiacs D!, {d7} + vstmiami D!, {s13} + .endif + .else + bcc 1f + .if backwards + vmov s8, s0 + vldmdb S!, {d0-d3} + vext.8 q2, q1, q2, #align + vext.8 q1, q0, q1, #align + vstmdb D!, {d2-d5} + .else + vmov s11, s19 + vldmia S!, {d6-d9} + vext.8 q2, q2, q3, #12+align + vext.8 q3, q3, q4, #12+align + vstmia D!, {d4-d7} + .endif +1: bpl 1f + .if backwards + vmov s4, s0 + vldmdb S!, {d0-d1} + vext.8 q1, q0, q1, #align + vstmdb D!, {d2-d3} + .else + vmov s15, s19 + vldmia S!, {d8-d9} + vext.8 q3, q3, q4, #12+align + vstmia D!, {d6-d7} + .endif +1: movs TMP, N, lsl #29 + bcc 1f + .if backwards + vmov s2, s0 + vldmdb S!, {d0} + vext.8 d1, d0, d1, #align + vstmdb D!, {d1} + .else + vmov s17, s19 + vldmia S!, {d9} + vext.8 d8, d8, d9, #4+align + vstmia D!, {d8} + .endif +1: bpl 1f + .if backwards + vmov s1, s0 + vldmdb S!, {s0} + vext.8 d1, d0, d1, #align + vstmdb D!, {s2} +1: add S, S, #align + .else + vmov s18, s19 + vldmia S!, {s19} + vext.8 d8, d9, d10, #align + vstmia D!, {s16} +1: sub S, S, #4-align + .endif + .endif + movs TMP, N, lsl #31 + bcc 1f + .if backwards + .if align == 0 || align == 2 + sub S, S, #2 + sub D, D, #2 + vld1.16 {d7[3]}, [S :16] + .else + sub S, S, #1 + sub D, D, #2 + vld1.8 {d7[7]}, [S] + sub S, S, #1 + vld1.8 {d7[6]}, [S] + .endif + vst1.16 {d7[3]}, [D :16] + .else + .if align == 0 || align == 2 + vld1.16 {d7[3]}, [S :16]! + .else + vld1.8 {d7[6]}, [S]! + vld1.8 {d7[7]}, [S]! + .endif + vst1.16 {d7[3]}, [D :16]! + .endif +1: bpl 1f + .if backwards + sub S, S, #1 + sub D, D, #1 + vld1.8 {d7[7]}, [S] + vst1.8 {d7[7]}, [D] + .else + vld1.8 {d7[7]}, [S]! + vst1.8 {d7[7]}, [D]! 
+ .endif +1: +.endm + +.macro memcpy_long_inner_loop backwards, align, add_nops + .if backwards + /* Bug in GAS: it accepts, but mis-assembles the instruction + * ands LEAD, D, #252, 2 + * which sets LEAD to the number of leading bytes until destination is aligned and also clears C (sets borrow) + */ + .word 0xE210C1FC + beq 154f + .else + ands LEAD, D, #63 + beq 154f + rsb LEAD, LEAD, #64 /* number of leading bytes until destination aligned */ + .endif + preload_leading_step2 backwards, P, S, 6, LEAD, TMP + memcpy_leading_63bytes backwards, align + sub N, N, LEAD + .if align != 0 + b 155f + .endif +154: + .if align != 0 + .if backwards + add S, S, #4-align + vldmdb S!, {s0} + .else + sub S, S, #align + vldmia S!, {s19} + .endif + .endif +155: /* Destination now 64-byte aligned; we have at least one prefetch as well as at least one 64-byte output block */ + /* Prefetch offset is best selected such that it lies in the first 16 of each 64 bytes - but it's just as easy to aim for the first one */ + .if backwards + rsb OFF, S, #0 + and OFF, OFF, #60 + sub OFF, OFF, #64*(prefetch_distance+1) + .else + and OFF, S, #60 + rsb OFF, OFF, #64*prefetch_distance + .endif +110: memcpy_middle_64bytes backwards, align, 1, add_nops + subs N, N, #64 + bhs 110b + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + preload_trailing backwards, S, 6, N, OFF + add N, N, #(prefetch_distance+2)*64 - 64 +120: memcpy_middle_64bytes backwards, align, 0, add_nops + subs N, N, #64 + bhs 120b + /* Trailing words and bytes */ + tst N, #63 + beq 199f + memcpy_trailing_63bytes backwards, align +199: + vpop {d8-d9} + pop {a1,pc} +.endm + +.macro memcpy_medium_inner_loop backwards, align + .if backwards + ands LEAD, D, #63 + beq 164f + .else + ands LEAD, D, #63 + beq 164f + rsb LEAD, LEAD, #64 + .endif + memcpy_leading_63bytes backwards, align + sub N, N, LEAD + .if align != 0 + b 165f + .endif +164: + .if align != 0 + .if backwards + add S, S, #4-align + vldmdb S!, {s0} + .else + sub S, S, #align + vldmia S!, {s19} + .endif + .endif +165: /* Destination now 64-byte aligned */ + subs N, N, #64 + blo 129f +120: memcpy_middle_64bytes backwards, align, 0, 0 + subs N, N, #64 + bhs 120b +129: /* Trailing words and bytes */ + tst N, #63 + beq 199f + memcpy_trailing_63bytes backwards, align +199: + vpop {d8-d9} + pop {a1,pc} +.endm + +.macro memcpy_short_inner_loop backwards, align + .if align != 0 + .if backwards + add S, S, #4-align + vldmdb S!, {s0} + .else + sub S, S, #align + vldmia S!, {s19} + .endif + .endif + memcpy_trailing_63bytes backwards, align +199: + vpop {d8-d9} + pop {a1,pc} +.endm + +.macro memcpy backwards + D .req a1 + S .req a2 + N .req a3 + P .req a4 + LEAD .req ip + OFF .req ip + TMP .req lr + + .cfi_startproc + + push {a1,lr} + vpush {d8-d9} + + .cfi_def_cfa_offset 16 + .cfi_rel_offset D, 8 + .cfi_undefined S + .cfi_undefined N + .cfi_undefined P + .cfi_undefined LEAD + .cfi_rel_offset lr, 12 + + add ip, D, N + /* See if we cross a 64-byte boundary at the destination */ + .if backwards + /* Also point S and D at the buffer ends if working downwards */ + eor D, ip, D + add S, S, N + bics D, D, #63 + mov D, ip + beq 170f + .else + eor ip, ip, D + bics ip, ip, #63 + beq 170f + .endif + + /* To preload ahead as we go, we need at least (prefetch_distance+2) 64-byte blocks */ + .if prefetch_distance > 1 + movw ip, #(prefetch_distance+3)*64 - 1 + cmp N, ip + .else + cmp N, #(prefetch_distance+3)*64 - 1 + .endif + blo 160f + + .if !backwards + /* If the data is not in the L2 
cache, we get up to a 5% speed + * boost by spacing out the instructions with NOPs. Use data + * length to estimate whether this is the case. */ + cmp N, #512*1024 @ L2 cache size for BCM2836 Cortex-A7 + blo 150f + + sub N, N, #(prefetch_distance+2)*64 + preload_leading_step1 backwards, P, S, 6 + + sub TMP, S, D + movs TMP, TMP, lsl #31 + bhi 148f + bcs 147f + bmi 146f + memcpy_long_inner_loop backwards, 0, 1 +146: memcpy_long_inner_loop backwards, 1, 1 +147: memcpy_long_inner_loop backwards, 2, 1 +148: memcpy_long_inner_loop backwards, 3, 1 + .endif + +150: /* Long case */ + /* Adjust N so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub N, N, #(prefetch_distance+2)*64 + preload_leading_step1 backwards, P, S, 6 + + sub TMP, S, D + movs TMP, TMP, lsl #31 + bhi 158f + bcs 157f + bmi 156f + memcpy_long_inner_loop backwards, 0, 0 +156: memcpy_long_inner_loop backwards, 1, 0 +157: memcpy_long_inner_loop backwards, 2, 0 +158: memcpy_long_inner_loop backwards, 3, 0 + +160: /* Medium case */ + preload_all backwards, 0, 0, S, 6, N, OFF, TMP + + sub TMP, S, D + movs TMP, TMP, lsl #31 + bhi 168f + bcs 167f + bmi 166f + memcpy_medium_inner_loop backwards, 0 +166: memcpy_medium_inner_loop backwards, 1 +167: memcpy_medium_inner_loop backwards, 2 +168: memcpy_medium_inner_loop backwards, 3 + +170: /* Short case, less than 127 bytes, so no guarantee of at least one 64-byte block */ + teq N, #0 + beq 199f + preload_all backwards, 1, 0, S, 6, N, OFF, TMP + + tst D, #3 + beq 174f +172: subs N, N, #1 + blo 199f + .if backwards + sub S, S, #1 + sub D, D, #1 + vld1.8 {d7[7]}, [S] + vst1.8 {d7[7]}, [D] + .else + vld1.8 {d7[7]}, [S]! + vst1.8 {d7[7]}, [D]! 
+ .endif + tst D, #3 + bne 172b +174: /* Destination now 4-byte aligned; we have 1 or more output bytes to go */ + sub TMP, S, D + movs TMP, TMP, lsl #31 + bhi 178f + bcs 177f + bmi 176f + memcpy_short_inner_loop backwards, 0 +176: memcpy_short_inner_loop backwards, 1 +177: memcpy_short_inner_loop backwards, 2 +178: memcpy_short_inner_loop backwards, 3 + + .cfi_endproc + + .unreq D + .unreq S + .unreq N + .unreq P + .unreq LEAD + .unreq OFF + .unreq TMP +.endm + +/* + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 preserved + */ + +.set prefetch_distance, 2 + +myfunc memcpy +1000: memcpy 0 + .size memcpy,.-memcpy +.endfunc + +/* + * void *memmove(void *s1, const void *s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 preserved + */ + +.set prefetch_distance, 2 + +myfunc memmove + cmp a2, a1 + bpl 1000b /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */ + memcpy 1 + .size memmove,.-memmove +.endfunc + +/* + * void *mempcpy(void * restrict s1, const void * restrict s2, size_t n); + * On entry: + * a1 = pointer to destination + * a2 = pointer to source + * a3 = number of bytes to copy + * On exit: + * a1 = pointer to immediately after destination block + */ + +myfunc mempcpy +.global __mempcpy +.type __mempcpy STT_FUNC +__mempcpy: + push {v1, lr} + mov v1, a3 + bl 1000b + add a1, a1, v1 + pop {v1, pc} + .size mempcpy,.-mempcpy + .size __mempcpy,.-__mempcpy diff --git a/deps/arm-mem/memset-v6l.S b/deps/arm-mem/memset-v6l.S new file mode 100644 index 00000000..d76ac558 --- /dev/null +++ b/deps/arm-mem/memset-v6l.S @@ -0,0 +1,122 @@ +/* +Copyright (c) 2013, Raspberry Pi Foundation +Copyright (c) 2013, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +/* + * void *memset(void *s, int c, size_t n); + * On entry: + * a1 = pointer to buffer to fill + * a2 = byte pattern to fill with (caller-narrowed) + * a3 = number of bytes to fill + * On exit: + * a1 preserved + */ +myfunc memset + S .req a1 + DAT0 .req a2 + N .req a3 + DAT1 .req a4 + DAT2 .req ip + DAT3 .req lr + + and DAT0, DAT0, #255 + push {S, lr} + orr DAT0, DAT0, lsl #8 + orr DAT0, DAT0, lsl #16 + mov DAT1, DAT0 + + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */ + cmp N, #31 + blo 170f + +161: sub N, N, #16 /* simplifies inner loop termination */ + /* Leading words and bytes */ + tst S, #15 + beq 164f + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */ + movs DAT2, DAT3, lsl #31 + submi N, N, #1 + strmib DAT0, [S], #1 + subcs N, N, #2 + strcsh DAT0, [S], #2 + movs DAT2, DAT3, lsl #29 + submi N, N, #4 + strmi DAT0, [S], #4 + subcs N, N, #8 + stmcsia S!, {DAT0, DAT1} +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */ + mov DAT2, DAT0 + mov DAT3, DAT0 + /* Now the inner loop of 16-byte stores */ +165: stmia S!, {DAT0, DAT1, DAT2, DAT3} + subs N, N, #16 + bhs 165b +166: /* Trailing words and bytes */ + movs N, N, lsl #29 + stmcsia S!, {DAT0, DAT1} + strmi DAT0, [S], #4 + movs N, N, lsl #2 + strcsh DAT0, [S], #2 + strmib DAT0, [S] +199: pop {S, pc} + +170: /* Short case */ + mov DAT2, DAT0 + mov DAT3, DAT0 + tst S, #3 + beq 174f +172: subs N, N, #1 + blo 199b + strb DAT0, [S], #1 + tst S, #3 + bne 172b +174: tst N, #16 + stmneia S!, {DAT0, DAT1, DAT2, DAT3} + b 166b + .size memset,.-memset + + .unreq S + .unreq DAT0 + .unreq N + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 +.endfunc diff --git a/deps/arm-mem/memset-v7l.S b/deps/arm-mem/memset-v7l.S new file mode 100644 index 00000000..0e02b165 --- /dev/null +++ b/deps/arm-mem/memset-v7l.S @@ -0,0 +1,120 @@ +/* +Copyright (c) 2018, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +/* + * void *memset(void *s, int c, size_t n); + * On entry: + * a1 = pointer to buffer to fill + * a2 = byte pattern to fill with (caller-narrowed) + * a3 = number of bytes to fill + * On exit: + * a1 preserved + */ +myfunc memset + SJ .req a2 + N .req a3 + SI .req a4 + OFF .req ip + + mov SI, a1 + vdup.8 q0, a2 + cmp N, #15+64 + vdup.8 q1, a2 + blo 170f + +161: ands ip, a1, #15 + beq 164f + rsb ip, ip, #16 /* number of leading bytes until 16-byte aligned */ + sub N, N, ip + rbit ip, ip + tst a1, #1 /* bit 0 unaffected by rsb so can avoid register interlock */ + strneb a2, [SI], #1 + movs ip, ip, lsl #2 + strcsb a2, [SI, #1] + strcsb a2, [SI], #2 + vstmmi SI!, {s0} + movs ip, ip, lsl #2 + vstmcs SI!, {d0} +164: /* Setup for the inner loop */ + mov OFF, #64 + sub N, N, #64 /* simplifies inner loop termination */ + add SJ, SI, #32 + /* Now the inner loop of 2x32-byte stores */ +165: vst1.8 {q0-q1}, [SI :128], OFF + subs N, N, #64 + vst1.8 {q0-q1}, [SJ :128], OFF + bhs 165b + /* Trailing words and bytes */ +166: vmov.32 a2, d0[0] + movs N, N, lsl #27 + bcc 167f + vst1.8 {q0-q1}, [SI]! +167: bpl 168f + vst1.8 {q0}, [SI]! +168: movs N, N, lsl #2 + vstmcs SI!, {d0} + strmi a2, [SI], #4 + movs N, N, lsl #2 + strcsh a2, [SI], #2 + strmib a2, [SI] +199: bx lr + +170: /* Short case */ + tst SI, #3 + beq 174f +172: subs N, N, #1 + blo 199b + strb a2, [SI], #1 + tst SI, #3 + bne 172b +174: cmp N, #32 + bcc 166b + vst1.8 {q0-q1}, [SI]! + sub N, N, #32 + b 166b + .size memset,.-memset + + .unreq SJ + .unreq N + .unreq SI + .unreq OFF +.endfunc diff --git a/deps/arm-mem/strlen-v7l.S b/deps/arm-mem/strlen-v7l.S new file mode 100644 index 00000000..cb547af9 --- /dev/null +++ b/deps/arm-mem/strlen-v7l.S @@ -0,0 +1,157 @@ +/* +Copyright (c) 2019, RISC OS Open Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "arm-mem.h" + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +/* + * size_t strlen (const char *__s); + * On entry: + * a1 = pointer to string + * On exit: + * a1 = length of string, exclusing terminator + */ +myfunc strlen + PTR .req a1 + OPTR .req a2 + MASK .req a3 + TMP0 .req a4 + TMP1 .req v1 + TMP2 .req ip + TMP3 .req lr + + push {v1,lr} + mov OPTR, PTR + movw MASK, #0xff8 + tst PTR, #7 + bne 20f + bics TMP0, MASK, PTR + beq 20f + +10: /* Handle 16 SIMD bytes per iteration until we hit a load that crosses a page boundary */ + /* Loop rotated so that termination test is in what would otherwise be a stall */ + vld1.8 {d0,d1}, [PTR :64]! + bics TMP0, MASK, PTR + beq 12f +11: vceq.i8 d0, #0 + vceq.i8 d1, #0 + vmov TMP0, s0 + vmov TMP1, s1 + vmov TMP2, s2 + vmov TMP3, s3 + teq TMP0, #0 + teqeq TMP1, #0 + teqeq TMP2, #0 + teqeq TMP3, #0 + bne 33f + vld1.8 {d0,d1}, [PTR :64]! + bics TMP0, MASK, PTR + bne 11b +12: vceq.i8 d0, #0 + vceq.i8 d1, #0 + vmov TMP0, s0 + vmov TMP1, s1 + vmov TMP2, s2 + vmov TMP3, s3 + teq TMP0, #0 + teqeq TMP1, #0 + teqeq TMP2, #0 + teqeq TMP3, #0 + bne 33f + /* Drop through... */ + +20: /* Handle one byte per iteration, for leading unaligned bytes or when approaching a page boundary */ + ldrb TMP0, [PTR], #1 +21: tst PTR, #7 + beq 22f + teq TMP0, #0 + beq 23f + ldrb TMP0, [PTR], #1 + b 21b + +22: teq TMP0, #0 + beq 23f + bics TMP0, MASK, PTR + bne 10b + b 20b + +23: /* Terminating null found during single-byte iteration */ + sub a1, PTR, OPTR + sub a1, #1 + pop {v1,pc} + +30: /* Terminating null found within TMP0 during SIMD iteration */ + rev TMP0, TMP0 + clz TMP0, TMP0 + sub a1, PTR, OPTR + sub a1, #16 + add a1, TMP0, lsr #3 + pop {v1,pc} + +31: /* Terminating null found within TMP1 during SIMD iteration */ + rev TMP1, TMP1 + clz TMP1, TMP1 + sub a1, PTR, OPTR + sub a1, #12 + add a1, TMP1, lsr #3 + pop {v1,pc} + +32: /* Terminating null found within TMP2 during SIMD iteration */ + rev TMP2, TMP2 + clz TMP2, TMP2 + sub a1, PTR, OPTR + sub a1, #8 + add a1, TMP2, lsr #3 + pop {v1,pc} + +33: teq TMP0, #0 + bne 30b + teq TMP1, #0 + bne 31b + teq TMP2, #0 + bne 32b + + /* Terminating null found within TMP3 during SIMD iteration */ + rev TMP3, TMP3 + clz TMP3, TMP3 + sub a1, PTR, OPTR + sub a1, #4 + add a1, TMP3, lsr #3 + pop {v1,pc} + .size strlen,.-strlen diff --git a/deps/arm-mem/test-strlen.c b/deps/arm-mem/test-strlen.c new file mode 100644 index 00000000..27699d24 --- /dev/null +++ b/deps/arm-mem/test-strlen.c @@ -0,0 +1,45 @@ +#include +#include +#include +#include +#include + +#include + +//extern size_t mystrlen(const char *s); +//#define strlen mystrlen + +#define PAGESIZE 4096 + +int main(void) +{ + /* To check we don't accidentally read off the end of the string + * across a page boundary, do our tests up to a mapped-out page. 
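+ * (The code below maps three pages and then marks the third one PROT_NONE,
+ * so any accidental read past the end of the second page faults.)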
+     * To check we handle boundaries between valid pages, we require
+     * two mapped-in pages beforehand.
+     */
+    uint8_t *buffer = mmap(NULL, 3*PAGESIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (buffer == MAP_FAILED)
+    {
+        fprintf(stderr, "mmap() failed\n");
+        exit(EXIT_FAILURE);
+    }
+    if (mprotect(buffer + 2*PAGESIZE, PAGESIZE, PROT_NONE) != 0)
+    {
+        perror("mprotect");
+        munmap(buffer, 3*PAGESIZE);
+        exit(EXIT_FAILURE);
+    }
+
+    for (uint32_t postamble = 0; postamble <= 32; postamble++)
+    {
+        memset(buffer, 'x', 2*PAGESIZE);
+        buffer[2*PAGESIZE - 1 - postamble] = '\0';
+        for (uint32_t start = 0; start <= 2*PAGESIZE - 1 - postamble; start++)
+            assert(strlen(buffer + start) == 2*PAGESIZE - 1 - postamble - start);
+    }
+
+    printf("strlen passes OK\n");
+    munmap(buffer, 3*PAGESIZE);
+    exit(EXIT_SUCCESS);
+}
diff --git a/deps/arm-mem/test.c b/deps/arm-mem/test.c
new file mode 100644
index 00000000..b52462d2
--- /dev/null
+++ b/deps/arm-mem/test.c
@@ -0,0 +1,421 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#define L1CACHESIZE (16*1024)
+#define L2CACHESIZE (128*1024)
+#define KILOBYTE (1024)
+#define MEGABYTE (1024*1024)
+
+#define TESTSIZE (40*MEGABYTE)
+
+#define TILEWIDTH (32)
+#define TINYWIDTH (8)
+
+#if 1
+#define CANDIDATE memcpy
+#define CANDIDATE_RETURN_TYPE void *
+#elif 1
+#define CANDIDATE memset
+#define CANDIDATE_RETURN_TYPE void *
+#elif 1
+#define CANDIDATE memcmp
+#define CANDIDATE_RETURN_TYPE int
+#endif
+
+
+/* Just used for cancelling out the overheads */
+static CANDIDATE_RETURN_TYPE control(const void *s1, const void *s2, size_t n)
+{
+    return 0;
+}
+
+static uint64_t gettime(void)
+{
+    struct timeval tv;
+
+    gettimeofday (&tv, NULL);
+    return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+static uint32_t bench_L(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t len, size_t times)
+{
+    int i, j, x = 0, q = 0;
+    volatile int qx;
+    for (i = times; i >= 0; i--)
+    {
+        /* Ensure the destination is in cache (if it gets flushed out, source gets reloaded anyway) */
+        for (j = 0; j < len; j += 32)
+            q += a[j];
+        q += a[len-1];
+        x = (x + 1) & 63;
+        test(a + x, b + 63 - x, len);
+    }
+    qx = q;
+    return len * times;
+}
+
+static uint32_t bench_M(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t len, size_t times)
+{
+    int i, x = 0;
+    for (i = times; i >= 0; i--)
+    {
+        x = (x + 1) & 63;
+        test(a + x, b + 63 - x, len);
+    }
+    return len * times;
+}
+
+static uint32_t bench_T(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+    uint32_t total = 0;
+    int i, x = 0;
+
+    srand (0);
+    for (i = times; i >= 0; i--)
+    {
+        int w = (rand () % (TILEWIDTH * 2)) + 1;
+        if (x + w > MEGABYTE)
+            x = 0;
+        test(a + x, b + x, w);
+        x += w;
+        total += w;
+    }
+    return total;
+}
+
+static uint32_t bench_R(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+    uint32_t total = 0;
+    int i;
+
+    srand (0);
+    for (i = times; i >= 0; i--)
+    {
+        int w = (rand () % (TILEWIDTH * 2)) + 1;
+        int ax = (rand() % (MEGABYTE - TILEWIDTH * 2));
+        int bx = (rand() % (MEGABYTE - TILEWIDTH * 2));
+        test(a + ax, b + bx, w);
+        total += w;
+    }
+    return total;
+}
+
+static uint32_t bench_RW(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t w, size_t times)
+{
+    uint32_t total = 0;
+    int i;
+
+    srand (0);
+    for (i = times; i >= 0; i--)
+    {
+        int ax = (rand() % (MEGABYTE - 1024));
+        int bx = (rand() % (MEGABYTE - 1024));
+        test(a + ax, b + bx, w);
+        total += w;
+    }
+    return total;
+}
+
+static uint32_t bench_RT(CANDIDATE_RETURN_TYPE (*test)(), char *a, char *b, size_t times)
+{
+    uint32_t total = 0;
+    int i;
+
+    srand (0);
+    for (i = times; i >= 0; i--)
+    {
+        int w = (rand () % (TINYWIDTH * 2)) + 1;
+        int ax = (rand() % (MEGABYTE - TINYWIDTH * 2));
+        int bx = (rand() % (MEGABYTE - TINYWIDTH * 2));
+        test(a + ax, b + bx, w);
+        total += w;
+    }
+    return total;
+}
+
+int main(int argc, char *argv[])
+{
+    static __attribute__((aligned(32))) char l1bufa[L1CACHESIZE/2-KILOBYTE];
+    static __attribute__((aligned(32))) char l1bufb[L1CACHESIZE/2-KILOBYTE];
+    static __attribute__((aligned(32))) char l2bufa[L2CACHESIZE/2-KILOBYTE];
+    static __attribute__((aligned(32))) char l2bufb[L2CACHESIZE/2-KILOBYTE];
+    static __attribute__((aligned(32))) char membufa[MEGABYTE];
+    static __attribute__((aligned(32))) char membufb[MEGABYTE];
+    size_t s, d, n;
+    uint64_t t1, t2, t3;
+    uint32_t byte_cnt;
+    size_t iterations;
+
+    srand(0);
+
+    if (argc != 2)
+    {
+        fprintf(stderr, "Syntax: %s <iterations>\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    iterations = atoi(argv[1]);
+
+    memset(l1bufa, 0x5A, sizeof l1bufa);
+    memset(l1bufb, 0x5A, sizeof l1bufb);
+    memset(l2bufa, 0x5A, sizeof l2bufa);
+    memset(l2bufb, 0x5A, sizeof l2bufb);
+    memset(membufa, 0x5A, sizeof membufa);
+    memset(membufb, 0x5A, sizeof membufb);
+
+    // This code was useful for correctness checking.
+    // The "my" prefix was used during development to enable the test harness to function
+    // even when the local implementations were buggy.
+#if 0
+    void *mymemset(void *s, int c, size_t n);
+    void *mymemcpy(void * restrict s1, const void * restrict s2, size_t n);
+    void *mymemmove(void *s1, const void *s2, size_t n);
+    int mymemcmp(const void *s1, const void *s2, size_t n);
+
+// These defines are used to prove that the test harness is correct - to test the local
+// implementations, comment out the #define
+#define mymemset memset
+#define mymemcmp memcmp
+#define mymemcpy memcpy
+    /* Check mymemset */
+    for (d = 0; d < 64; d++)
+    {
+        for (n = 0; n < 192; n++)
+        {
+            memset(l1bufa+d, 0xA5, n);
+            mymemset(l1bufa+d, 0x5A, n);
+            if (memcmp(l1bufa, l1bufb, sizeof l1bufa) != 0)
+            {
+                printf("memset failed (insufficient) with d = %d, n = %d\n", d, n);
+                for (int x = 0; x < sizeof l1bufa; x++)
+                    if (l1bufa[x] != 0x5A)
+                        printf("Offset %d is wrong\n", x);
+            }
+            mymemset(l1bufa+d, 0xA5, n);
+            memset(l1bufa+d, 0x5A, n);
+            if (memcmp(l1bufa, l1bufb, sizeof l1bufa) != 0)
+            {
+                printf("memset failed (excessive) with d = %d, n = %d\n", d, n);
+                for (int x = 0; x < sizeof l1bufa; x++)
+                    if (l1bufa[x] != 0x5A)
+                        printf("Offset %d is wrong\n", x);
+            }
+        }
+    }
+
+    /* Check memcmp */
+    {
+#define SIGNOF(x) (((x)>0)-((x)<0))
+        uint32_t a = 0x00010200, b = 0x00020100;
+        int d1,d2;
+        if ((d1=SIGNOF(memcmp(l1bufa, l1bufb, sizeof l1bufa))) != (d2=SIGNOF(mymemcmp(l1bufa, l1bufb, sizeof l1bufa))))
+            printf("memcmp failed (0: %d %d)\n", d1, d2);
+        if ((d1=SIGNOF(memcmp(&a, &b, 4))) != (d2=SIGNOF(mymemcmp(&a, &b, 4))))
+            printf("memcmp failed (1: %d %d)\n", d1, d2);
+        if ((d1=SIGNOF(memcmp(&b, &a, 4))) != (d2=SIGNOF(mymemcmp(&b, &a, 4))))
+            printf("memcmp failed (2: %d %d)\n", d1, d2);
+
+        /*
+        for (size_t i = 32-(((int) l1bufa)&31); i < 32-(((int) l1bufa)&31) + 32; i++)
+        {
+            for (size_t len = 0; len < 256; len++)
+            {
+                mymemcpy(l1bufb+0, l1bufa+i, len);
+            }
+            for (size_t len = 0; len < 256; len++)
+            {
+                mymemcpy(l1bufb+1, l1bufa+i, len);
+            }
+            for (size_t len = 0; len < 256; len++)
+            {
+                mymemcpy(l1bufb+2, l1bufa+i, len);
+            }
+            for (size_t len = 0; len < 256; len++)
+            {
+                mymemcpy(l1bufb+30, l1bufa+i, len);
+            }
+            for (size_t len = 0; len
< 256; len++) + { + mymemcpy(l1bufb+31, l1bufa+i, len); + } + } + */ + + memset(l2bufa, 0, sizeof l1bufa); + for (size_t i = 0; i < sizeof l1bufa; i += 4) + *(uint32_t*)(l1bufa+i) = rand(); + for (size_t i = 0; i < 64; i++) + { + printf("%u\n", i); + for (size_t j = 0; j < 64; j++) + for (size_t len = 0; len < 2048; len++) + { + int myresult; + int trueresult; + memset(l1bufb, 0, sizeof l1bufb); + mymemcpy(l1bufb+j, l1bufa+i, len); + if (memcmp(l1bufb+j, l1bufa+i, len) != 0) + { + printf("memcpy failed (data: %u %u %u)\n", i, j, len); + printf("should be"); + for (size_t x = 0; x < len; x++) + printf(" %02X%s", l1bufa[i+x] & 0xFF, l1bufa[i+x] != l1bufb[j+x] ? "*" : ""); + printf("\nbut is "); + for (size_t x = 0; x < len; x++) + printf(" %02X%s", l1bufb[j+x] & 0xFF, l1bufa[i+x] != l1bufb[j+x] ? "*" : ""); + printf("\n"); + } + else if ((myresult = mymemcmp(l1bufb+j, l1bufa+i, len)) != 0) + { + printf("memcmp failed (%u %u %u) was %08x (%c0), should be =0\n", i, j, len, myresult, "<=>"[SIGNOF(myresult) + 1]); + myresult = mymemcmp(l1bufb+j, l1bufa+i, len); + } + for (size_t k = 0; k + 1 < len && k + 1 < 20; k++) + { + size_t k2 = len - 2 - k; + l1bufb[j+k] ^= 0x80; + l1bufb[j+k+1] ^= 0x80; + + myresult = mymemcmp(l1bufb+j, l1bufa+i, len); + trueresult = memcmp(l1bufb+j, l1bufa+i, len); + if (SIGNOF(myresult) != SIGNOF(trueresult)) + { + printf("memcmp failed (%u %u %u with diff at %u was %08x (%c0), should be %c0\n", + i, j, len, k, + myresult, + "<=>"[SIGNOF(myresult) + 1], + "<=>"[SIGNOF(trueresult) + 1]); + myresult = mymemcmp(l1bufb+j, l1bufa+i, len); + } + l1bufb[j+k] ^= 0x80; + l1bufb[j+k+1] ^= 0x80; + l1bufb[j+k2] ^= 0x80; + l1bufb[j+k2+1] ^= 0x80; + myresult = mymemcmp(l1bufb+j, l1bufa+i, len); + trueresult = memcmp(l1bufb+j, l1bufa+i, len); + if (SIGNOF(myresult) != SIGNOF(trueresult)) + { + printf("memcmp failed (%u %u %u with diff at %u was %08x (%c0), should be %c0\n", + i, j, len, k2, + myresult, + "<=>"[SIGNOF(myresult) + 1], + "<=>"[SIGNOF(trueresult) + 1]); + myresult = mymemcmp(l1bufb+j, l1bufa+i, len); + } + l1bufb[j+k2] ^= 0x80; + l1bufb[j+k2+1] ^= 0x80; + } + if (memcmp(l1bufb, l2bufa, j) != 0) + printf("memcpy failed (before: %u %u %u)\n", i, j, len); + if (memcmp(l1bufb+j+len, l2bufa, sizeof l1bufa -j-len) != 0) + printf("memcpy failed (after: %u %u %u)\n", i, j, len); + } + } + } +#endif + + // This code is for benchmarking +#if 1 + printf("L1, L2, M, T, R, RT\n"); + + while (iterations--) + { + memcpy(l1bufa, l1bufb, sizeof l1bufa); + memcpy(l1bufb, l1bufa, sizeof l1bufa); + + t1 = gettime(); + bench_L(control, l1bufa, l1bufb, sizeof l1bufa - 64, TESTSIZE / (sizeof l1bufa - 64)); + t2 = gettime(); + byte_cnt = bench_L(CANDIDATE, l1bufa, l1bufb, sizeof l1bufa - 64, TESTSIZE / (sizeof l1bufa - 64)); + t3 = gettime(); + printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + + memcpy(l2bufa, l2bufb, sizeof l2bufa); + memcpy(l2bufb, l2bufa, sizeof l2bufa); + + t1 = gettime(); + bench_L(control, l2bufa, l2bufb, sizeof l2bufa - 64, TESTSIZE / (sizeof l2bufa - 64)); + t2 = gettime(); + byte_cnt = bench_L(CANDIDATE, l2bufa, l2bufb, sizeof l2bufa - 64, TESTSIZE / (sizeof l2bufa - 64)); + t3 = gettime(); + printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + + memcpy(membufa, membufb, sizeof membufa); + memcpy(membufb, membufa, sizeof membufa); + + t1 = gettime(); + bench_M(control, membufa, membufb, sizeof membufa - 64, TESTSIZE / (sizeof membufa - 64)); + t2 = gettime(); + byte_cnt = bench_M(CANDIDATE, membufa, 
membufb, sizeof membufa - 64, TESTSIZE / (sizeof membufa - 64)); + t3 = gettime(); + printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + + memcpy(membufa, membufb, sizeof membufa); + memcpy(membufb, membufa, sizeof membufa); + + t1 = gettime(); + bench_T(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t2 = gettime(); + byte_cnt = bench_T(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t3 = gettime(); + printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + + memcpy(membufa, membufb, sizeof membufa); + memcpy(membufb, membufa, sizeof membufa); + + t1 = gettime(); + bench_R(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t2 = gettime(); + byte_cnt = bench_R(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t3 = gettime(); + printf("%6.2f, ", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + + memcpy(membufa, membufb, sizeof membufa); + memcpy(membufb, membufa, sizeof membufa); + + t1 = gettime(); + bench_RT(control, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t2 = gettime(); + byte_cnt = bench_RT(CANDIDATE, membufa, membufb, TESTSIZE / (TILEWIDTH*2)); + t3 = gettime(); + printf("%6.2f\n", ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + fflush(stdout); + } +#elif 0 + const char *sep = ""; + for (int w = 1; w <= 100; w++) + { + printf("%sW%d", sep, w); + sep = ", "; + } + printf("\n"); + + while (iterations--) + { + sep = ""; + for (int w = 1; w <= 100; w++) + { + memcpy(membufa, membufb, sizeof membufa); + memcpy(membufb, membufa, sizeof membufa); + + t1 = gettime(); + bench_RW(control, membufa, membufb, w, TESTSIZE / w); + t2 = gettime(); + byte_cnt = bench_RW(CANDIDATE, membufa, membufb, w, TESTSIZE / w); + t3 = gettime(); + printf("%s%6.2f", sep, ((double)byte_cnt) / ((t3 - t2) - (t2 - t1))); + sep = ", "; + fflush(stdout); + } + printf("\n"); + } +#endif +} -- 2.39.5
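
For anyone pulling this subrepo into a tree, a small standalone smoke test is a quick way to confirm the drop-in routines still honour the standard string.h contract before relying on them, complementing the bundled test.c and test-strlen.c harnesses. The sketch below is illustrative only: the file name check.c, the compile line, and the -fno-builtin flag (used so GCC calls the library instead of expanding the calls inline) are assumptions rather than anything shipped in the subrepo; the library names follow the Makefile targets. The overlapping cases are worth exercising explicitly because memcpy and memmove share a source file (memcpymove-v7l.S), so an aliasing regression would not show up in a plain forward copy.

/* check.c - minimal smoke test for the drop-in string routines (hypothetical).
 * One possible build, assuming the v7 static library has been built in-tree:
 *   arm-linux-gnueabihf-gcc -O2 -fno-builtin -o check check.c libarmmem-v7l.a
 * Alternatively, preload the shared build into an existing program:
 *   LD_PRELOAD=./libarmmem-v7l.so ./some-program
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[256];

    /* memset/memcmp at an unaligned start and an odd length */
    memset(buf, 0, sizeof buf);
    memset(buf + 3, 0xA5, 61);
    assert(buf[2] == 0 && (unsigned char)buf[3] == 0xA5 &&
           (unsigned char)buf[63] == 0xA5 && buf[64] == 0);

    /* overlapping memmove in both directions must behave as if the data
     * went through a temporary buffer */
    memcpy(buf, "abcdefghij", 11);
    memmove(buf + 2, buf, 8);            /* destination above source */
    assert(memcmp(buf, "ababcdefgh", 10) == 0);
    memcpy(buf, "abcdefghij", 11);
    memmove(buf, buf + 2, 8);            /* destination below source */
    assert(memcmp(buf, "cdefghijij", 10) == 0);

    /* strlen from a deliberately misaligned pointer */
    memcpy(buf, "xxxhello", 9);
    assert(strlen(buf + 3) == 5);

    puts("smoke test passed");
    return 0;
}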