/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

| #include "asmdefs.h" |
| |
| #define src1 x0 |
| #define src2 x1 |
| #define limit x2 |
| #define result w0 |
| |
| #define data1 x3 |
| #define data1w w3 |
| #define data2 x4 |
| #define data2w w4 |
| #define data3 x5 |
| #define data3w w5 |
| #define data4 x6 |
| #define data4w w6 |
| #define tmp x6 |
| #define src1end x7 |
| #define src2end x8 |
| |
| |
| ENTRY (__memcmp_aarch64) |
| PTR_ARG (0) |
| PTR_ARG (1) |
| SIZE_ARG (2) |
| |
| cmp limit, 16 |
| b.lo L(less16) |
| ldp data1, data3, [src1] |
| ldp data2, data4, [src2] |
| ccmp data1, data2, 0, ne |
| ccmp data3, data4, 0, eq |
| b.ne L(return2) |
| |
| add src1end, src1, limit |
| add src2end, src2, limit |
| cmp limit, 32 |
| b.ls L(last_bytes) |
| cmp limit, 160 |
| b.hs L(loop_align) |
| sub limit, limit, 32 |
| |
| .p2align 4 |
| L(loop32): |
| ldp data1, data3, [src1, 16] |
| ldp data2, data4, [src2, 16] |
| cmp data1, data2 |
| ccmp data3, data4, 0, eq |
| b.ne L(return2) |
| cmp limit, 16 |
| b.ls L(last_bytes) |
| |
| ldp data1, data3, [src1, 32] |
| ldp data2, data4, [src2, 32] |
| cmp data1, data2 |
| ccmp data3, data4, 0, eq |
| b.ne L(return2) |
| add src1, src1, 32 |
| add src2, src2, 32 |
| L(last64): |
| subs limit, limit, 32 |
| b.hi L(loop32) |
| |
| /* Compare last 1-16 bytes using unaligned access. */ |
| L(last_bytes): |
| ldp data1, data3, [src1end, -16] |
| ldp data2, data4, [src2end, -16] |
| L(return2): |
| cmp data1, data2 |
| csel data1, data1, data3, ne |
| csel data2, data2, data4, ne |
| |
| /* Compare data bytes and set return value to 0, -1 or 1. */ |
| L(return): |
| #ifndef __AARCH64EB__ |
| rev data1, data1 |
| rev data2, data2 |
| #endif |
| cmp data1, data2 |
| cset result, ne |
| cneg result, result, lo |
| ret |
| |
| .p2align 4 |
| L(less16): |
| add src1end, src1, limit |
| add src2end, src2, limit |
| tbz limit, 3, L(less8) |
| ldr data1, [src1] |
| ldr data2, [src2] |
| ldr data3, [src1end, -8] |
| ldr data4, [src2end, -8] |
| b L(return2) |
| |
| .p2align 4 |
| L(less8): |
| tbz limit, 2, L(less4) |
| ldr data1w, [src1] |
| ldr data2w, [src2] |
| ldr data3w, [src1end, -4] |
| ldr data4w, [src2end, -4] |
| b L(return2) |
| |
| L(less4): |
| tbz limit, 1, L(less2) |
| ldrh data1w, [src1] |
| ldrh data2w, [src2] |
| cmp data1w, data2w |
| b.ne L(return) |
| L(less2): |
| mov result, 0 |
| tbz limit, 0, L(return_zero) |
| ldrb data1w, [src1end, -1] |
| ldrb data2w, [src2end, -1] |
| sub result, data1w, data2w |
| L(return_zero): |
| ret |
| |
| L(loop_align): |
| ldp data1, data3, [src1, 16] |
| ldp data2, data4, [src2, 16] |
| cmp data1, data2 |
| ccmp data3, data4, 0, eq |
| b.ne L(return2) |
| |
| /* Align src2 and adjust src1, src2 and limit. */ |
| and tmp, src2, 15 |
| sub tmp, tmp, 16 |
| sub src2, src2, tmp |
| add limit, limit, tmp |
| sub src1, src1, tmp |
| sub limit, limit, 64 + 16 |
| |
| .p2align 4 |
| L(loop64): |
| ldr q0, [src1, 16] |
| ldr q1, [src2, 16] |
| subs limit, limit, 64 |
| ldr q2, [src1, 32] |
| ldr q3, [src2, 32] |
| eor v0.16b, v0.16b, v1.16b |
| eor v1.16b, v2.16b, v3.16b |
| ldr q2, [src1, 48] |
| ldr q3, [src2, 48] |
| umaxp v0.16b, v0.16b, v1.16b |
| ldr q4, [src1, 64]! |
| ldr q5, [src2, 64]! |
| eor v1.16b, v2.16b, v3.16b |
| eor v2.16b, v4.16b, v5.16b |
| umaxp v1.16b, v1.16b, v2.16b |
| umaxp v0.16b, v0.16b, v1.16b |
| umaxp v0.16b, v0.16b, v0.16b |
| fmov tmp, d0 |
| ccmp tmp, 0, 0, hi |
| b.eq L(loop64) |
| |
| /* If equal, process last 1-64 bytes using scalar loop. */ |
| add limit, limit, 64 + 16 |
| cbz tmp, L(last64) |
| |
| /* Determine the 8-byte aligned offset of the first difference. */ |
| #ifdef __AARCH64EB__ |
| rev16 tmp, tmp |
| #endif |
| rev tmp, tmp |
| clz tmp, tmp |
| bic tmp, tmp, 7 |
| sub tmp, tmp, 48 |
| ldr data1, [src1, tmp] |
| ldr data2, [src2, tmp] |
| #ifndef __AARCH64EB__ |
| rev data1, data1 |
| rev data2, data2 |
| #endif |
| mov result, 1 |
| cmp data1, data2 |
| cneg result, result, lo |
| ret |
| |
| END (__memcmp_aarch64) |