/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_VPCLMULQDQ_CRC
#include "../../zbuild.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
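/* ONCE(op) executes op only on the first pass and then clears the flag;
 * XOR_INITIAL() uses it to fold the caller's initial CRC into the first
 * 64-byte block exactly once.
 */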
#define ONCE(op) if (first) { \
first = 0; \
(op); \
}
#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
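/* Fold the running CRC state across len bytes of src while copying them to
 * dst, processing 256 bytes (four ZMM registers) per iteration. The four xmm
 * accumulators are updated in place; the return value is the number of bytes
 * consumed (a multiple of 256), so the caller handles any remaining tail.
 * Expects len >= 256.
 */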
size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
size_t len_tmp = len;
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
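/* CRC-32 folding constants (low/high 64-bit multipliers per 128-bit lane):
 * zmm_fold4 advances the state by 4 lanes (512 bits) and is used for the
 * initial alignment and the final reduction; zmm_fold16 advances by 16 lanes
 * (2048 bits), one full 256-byte loop iteration.
 */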
const __m512i zmm_fold4 = _mm512_set4_epi32(
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m512i zmm_fold16 = _mm512_set4_epi32(
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init: zero the accumulator and load the first 256 bytes of src
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
/* The caller already holds an intermediate CRC in the four xmm registers;
 * gather them into zmm_crc0 and fold by 4 lanes so the state lines up with
 * the 64 bytes just loaded.
 */
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
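/* Per 128-bit lane: imm 0x01 multiplies the high qword of the state by the
 * low qword of the constant, imm 0x10 the low qword by the high qword;
 * XORing the two carry-less products folds the lane forward, then the first
 * 64 bytes of data are XORed in.
 */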
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
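/* Copy the 256 bytes just loaded from src out to dst. */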
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
len -= 256;
src += 256;
dst += 256;
// Main loop: fold 16 lanes (256 bytes) per iteration while copying src to dst
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
_mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
_mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
_mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
len -= 256;
src += 256;
dst += 256;
}
// Reduce zmm_crc[0,1,2,3] down to zmm_crc0 with three successive fold-by-4 steps
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
// Split zmm_crc0 back into the four xmm CRC accumulators
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // number of bytes processed (a multiple of 256)
}
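/* Same folding kernel as fold_16_vpclmulqdq, but without the copy to dst
 * ("nocp"). When first is nonzero, the initial CRC passed in init_crc is
 * XORed into the first 64-byte block via XOR_INITIAL before folding.
 */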
size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
__m128i init_crc, int32_t first) {
size_t len_tmp = len;
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
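/* init_crc occupies only the low 128 bits of zmm_initial (the upper lanes are
 * zeroed), so XOR_INITIAL touches just the first 16 bytes of the block.
 */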
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
const __m512i zmm_fold4 = _mm512_set4_epi32(
0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m512i zmm_fold16 = _mm512_set4_epi32(
0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init: zero the accumulator and load the first 256 bytes of src
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
XOR_INITIAL(zmm_t0);
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
/* The caller already holds an intermediate CRC in the four xmm registers;
 * gather them into zmm_crc0 and fold by 4 lanes so the state lines up with
 * the 64 bytes just loaded.
 */
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
len -= 256;
src += 256;
// Main loop: fold 16 lanes (256 bytes) per iteration
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
len -= 256;
src += 256;
}
// Reduce zmm_crc[0,1,2,3] down to zmm_crc0 with three successive fold-by-4 steps
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
// Split zmm_crc0 back into the four xmm CRC accumulators
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // number of bytes processed (a multiple of 256)
}
#endif