string: Improve SVE memcpy

Improve SVE memcpy by unconditionally copying two vectors in the small-copy
path. This avoids a check on the vector length and improves the performance
of random memcpy.
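
For reference, a minimal C sketch of the new small-copy path using the Arm
SVE ACLE intrinsics. The function name, signature, and the size precondition
stated in the comment are illustrative assumptions only; the patch itself
changes the hand-written assembly below.

    /* Illustrative sketch, not part of the patch: the two-vector small copy,
       assuming count <= 2 * svcntb() (i.e. at most two vector lengths).  */
    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    void small_copy_sve (uint8_t *dst, const uint8_t *src, size_t count)
    {
      /* Predicates for the first and second vector's worth of bytes.  If count
         fits in one vector, p1 is all-false, so the second load returns zeros
         and the second store writes nothing - no vector-length check needed.  */
      svbool_t p0 = svwhilelt_b8 ((uint64_t) 0, (uint64_t) count);
      svbool_t p1 = svwhilelt_b8 (svcntb (), (uint64_t) count);

      svuint8_t z0 = svld1_vnum_u8 (p0, src, 0);  /* ld1b z0.b, p0/z, [src, 0, mul vl] */
      svuint8_t z1 = svld1_vnum_u8 (p1, src, 1);  /* ld1b z1.b, p1/z, [src, 1, mul vl] */
      svst1_vnum_u8 (p0, dst, 0, z0);             /* st1b z0.b, p0, [dstin, 0, mul vl] */
      svst1_vnum_u8 (p1, dst, 1, z1);             /* st1b z1.b, p1, [dstin, 1, mul vl] */
    }
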
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index f74d4a9..61d36f2 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -57,14 +57,16 @@
 
 	cmp	count, 128
 	b.hi	L(copy_long)
-	cmp	count, 32
+	cntb	vlen
+	cmp	count, vlen, lsl 1
 	b.hi	L(copy32_128)
 
 	whilelo p0.b, xzr, count
-	cntb	vlen
-	tbnz	vlen, 4, L(vlen128)
-	ld1b	z0.b, p0/z, [src]
-	st1b	z0.b, p0, [dstin]
+	whilelo p1.b, vlen, count
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
 	ret
 
 	/* Medium copies: 33..128 bytes.  */
@@ -133,14 +135,6 @@
 	stp	A_q, B_q, [dstend, -32]
 	ret
 
-L(vlen128):
-	whilelo p1.b, vlen, count
-	ld1b	z0.b, p0/z, [src, 0, mul vl]
-	ld1b	z1.b, p1/z, [src, 1, mul vl]
-	st1b	z0.b, p0, [dstin, 0, mul vl]
-	st1b	z1.b, p1, [dstin, 1, mul vl]
-	ret
-
 	/* Large backwards copy for overlapping copies.
 	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
 L(copy_long_backwards):