/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
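
/* Usage sketch (illustrative comment, not part of the original header):
   _bzhi_u32 zeroes the bits of __X at index __Y and above by shifting
   them out and back.  Note that the shift-based pattern assumes
   0 < __Y < 32; __Y == 0 would shift by the full type width, which is
   undefined behavior in C.

     unsigned int __lo4 = _bzhi_u32(0xDEADBEEFu, 4);   // 0x0000000F
     unsigned int __lo16 = _bzhi_u32(0xDEADBEEFu, 16); // 0x0000BEEF
*/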

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long)__X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
}
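
/* Usage sketch (illustrative comment): _mulx_u32 performs a full
   32 x 32 -> 64-bit unsigned multiply, returning the low half and storing
   the high half through __P, matching the x86 MULX split.

     unsigned int __hi;
     unsigned int __lo = _mulx_u32(0xFFFFFFFFu, 0xFFFFFFFFu, &__hi);
     // 0xFFFFFFFF * 0xFFFFFFFF == 0xFFFFFFFE00000001,
     // so __hi == 0xFFFFFFFE and __lo == 0x00000001.
*/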

#ifdef __PPC64__
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  return ((__X << (64 - __Y)) >> (64 - __Y));
}
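
/* Usage sketch (illustrative comment): the same shift trick at 64 bits;
   as above, it assumes 0 < __Y < 64, since a shift by the full type
   width is undefined behavior in C.

     unsigned long long __lo8 = _bzhi_u64(0x123456789ABCDEF0ULL, 8); // 0xF0
*/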

/* __int128 requires a 64-bit base architecture.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__res >> 64);
  return (unsigned long long)__res;
}
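
/* Usage sketch (illustrative comment): a full 64 x 64 -> 128-bit unsigned
   multiply through __int128, split into high and low halves.

     unsigned long long __hi;
     unsigned long long __lo = _mulx_u64(0xFFFFFFFFFFFFFFFFULL, 2ULL, &__hi);
     // The 128-bit product is 0x1FFFFFFFFFFFFFFFE,
     // so __hi == 1 and __lo == 0xFFFFFFFFFFFFFFFE.
*/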

#ifdef _ARCH_PWR7
/* The popcount and bpermd instructions require POWER7 at minimum.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  This is also needed to shift bits from the
     source into the correct position in the result.  */
  __p = 64 - __builtin_popcountl(__M);

  /* The loop runs once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
  while (__m != 0) {
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c);
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}
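
/* Usage sketch (illustrative comment): _pdep_u64 scatters the low-order
   bits of __X into the '1' positions of __M, from least to most
   significant.

     // Mask 0b1010 has two selected positions, so the two low source
     // bits (0b11) are deposited there.
     unsigned long long __r = _pdep_u64(0x3ULL, 0xAULL); // __r == 0xA
*/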

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the bit permute doubleword instruction (bpermd).  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* If the pext mask is constant, then its popcount is also
       constant, so the following loop can be evaluated at compile
       time, yielding a constant bit permute vector.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling and the compiler commons the xor
       from clearing the mask bit with the (__m != 0) test.  The
       result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}
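
/* Usage sketch (illustrative comment): _pext_u64 gathers the bits of __X
   selected by the '1' positions of __M and packs them at the low end.

     // Mask 0b1010 selects bits 3 and 1 of the source 0b1000; they pack
     // down to 0b10.
     unsigned long long __r = _pext_u64(0x8ULL, 0xAULL); // __r == 0x2
*/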

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return _pdep_u64(__X, __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return _pext_u64(__X, __Y);
}
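
/* Usage sketch (illustrative comment): the 32-bit forms simply delegate
   to the 64-bit loops; zero-extension of both arguments keeps the upper
   32 mask bits clear, so the result always fits in 32 bits.

     unsigned int __r = _pdep_u32(0x3u, 0xAu); // __r == 0xA, as at 64 bits
*/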
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */

#endif /* BMI2INTRIN_H_ */