string: Improve SVE memcpy

Improve SVE memcpy by unconditionally copying two vectors in the small-copy
path. This avoids a check on the vector length and improves the performance
of random memcpy.
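
For reference, a minimal C sketch of the new small-copy path using the Arm
SVE ACLE intrinsics. The function name, signature, and the size precondition
stated in the comment are illustrative assumptions only; the patch itself
changes the hand-written assembly below.

    /* Illustrative sketch, not part of the patch: the two-vector small copy,
       assuming count <= 2 * svcntb() (i.e. at most two vector lengths).  */
    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    void small_copy_sve (uint8_t *dst, const uint8_t *src, size_t count)
    {
      /* Predicates for the first and second vector's worth of bytes.  If count
         fits in one vector, p1 is all-false, so the second load returns zeros
         and the second store writes nothing - no vector-length check needed.  */
      svbool_t p0 = svwhilelt_b8 ((uint64_t) 0, (uint64_t) count);
      svbool_t p1 = svwhilelt_b8 (svcntb (), (uint64_t) count);

      svuint8_t z0 = svld1_vnum_u8 (p0, src, 0);  /* ld1b z0.b, p0/z, [src, 0, mul vl] */
      svuint8_t z1 = svld1_vnum_u8 (p1, src, 1);  /* ld1b z1.b, p1/z, [src, 1, mul vl] */
      svst1_vnum_u8 (p0, dst, 0, z0);             /* st1b z0.b, p0, [dstin, 0, mul vl] */
      svst1_vnum_u8 (p1, dst, 1, z1);             /* st1b z1.b, p1, [dstin, 1, mul vl] */
    }
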
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index f74d4a9..61d36f2 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -57,14 +57,16 @@
 
 	cmp	count, 128
 	b.hi	L(copy_long)
-	cmp	count, 32
+	cntb	vlen
+	cmp	count, vlen, lsl 1
 	b.hi	L(copy32_128)
 
 	whilelo p0.b, xzr, count
-	cntb	vlen
-	tbnz	vlen, 4, L(vlen128)
-	ld1b	z0.b, p0/z, [src]
-	st1b	z0.b, p0, [dstin]
+	whilelo p1.b, vlen, count
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
 	ret
 
 	/* Medium copies: 33..128 bytes.  */
@@ -133,14 +135,6 @@
 	stp	A_q, B_q, [dstend, -32]
 	ret
 
-L(vlen128):
-	whilelo p1.b, vlen, count
-	ld1b	z0.b, p0/z, [src, 0, mul vl]
-	ld1b	z1.b, p1/z, [src, 1, mul vl]
-	st1b	z0.b, p0, [dstin, 0, mul vl]
-	st1b	z1.b, p1, [dstin, 1, mul vl]
-	ret
-
 	/* Large backwards copy for overlapping copies.
 	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
 L(copy_long_backwards):