Blame - renderscript/clang-include/avxintrin.h - platform/prebuilts/fullsdk-linux/build-tools/29.0.3

blob: 86bfdfb80c796a74d9f83bdf05b13484c108b51e [file] [log] [blame]

Louis Pullen-Freilich	2d89707	2020-01-30 14:59:03 +0000	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
				24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				26	#endif
				27
				28	#ifndef __AVXINTRIN_H
				29	#define __AVXINTRIN_H
				30
				31	typedef double __v4df __attribute__ ((__vector_size__ (32)));
				32	typedef float __v8sf __attribute__ ((__vector_size__ (32)));
				33	typedef long long __v4di __attribute__ ((__vector_size__ (32)));
				34	typedef int __v8si __attribute__ ((__vector_size__ (32)));
				35	typedef short __v16hi __attribute__ ((__vector_size__ (32)));
				36	typedef char __v32qi __attribute__ ((__vector_size__ (32)));
				37
				38	/* Unsigned types */
				39	typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
				40	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
				41	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
				42	typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
				43
				44	/* We need an explicitly signed variant for char. Note that this shouldn't
				45	* appear in the interface though. */
				46	typedef signed char __v32qs __attribute__((__vector_size__(32)));
				47
				48	typedef float __m256 __attribute__ ((__vector_size__ (32)));
				49	typedef double __m256d __attribute__((__vector_size__(32)));
				50	typedef long long __m256i __attribute__((__vector_size__(32)));
				51
				52	/* Define the default attributes for the functions in this file. */
				53	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
				54
				55	/* Arithmetic */
				56	/// \brief Adds two 256-bit vectors of [4 x double].
				57	///
				58	/// \headerfile <x86intrin.h>
				59	///
				60	/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
				61	///
				62	/// \param __a
				63	/// A 256-bit vector of [4 x double] containing one of the source operands.
				64	/// \param __b
				65	/// A 256-bit vector of [4 x double] containing one of the source operands.
				66	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				67	/// operands.
				68	static __inline __m256d __DEFAULT_FN_ATTRS
				69	_mm256_add_pd(__m256d __a, __m256d __b)
				70	{
				71	return (__m256d)((__v4df)__a+(__v4df)__b);
				72	}
				73
				74	/// \brief Adds two 256-bit vectors of [8 x float].
				75	///
				76	/// \headerfile <x86intrin.h>
				77	///
				78	/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
				79	///
				80	/// \param __a
				81	/// A 256-bit vector of [8 x float] containing one of the source operands.
				82	/// \param __b
				83	/// A 256-bit vector of [8 x float] containing one of the source operands.
				84	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				85	/// operands.
				86	static __inline __m256 __DEFAULT_FN_ATTRS
				87	_mm256_add_ps(__m256 __a, __m256 __b)
				88	{
				89	return (__m256)((__v8sf)__a+(__v8sf)__b);
				90	}
				91
				92	/// \brief Subtracts two 256-bit vectors of [4 x double].
				93	///
				94	/// \headerfile <x86intrin.h>
				95	///
				96	/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
				97	///
				98	/// \param __a
				99	/// A 256-bit vector of [4 x double] containing the minuend.
				100	/// \param __b
				101	/// A 256-bit vector of [4 x double] containing the subtrahend.
				102	/// \returns A 256-bit vector of [4 x double] containing the differences between
				103	/// both operands.
				104	static __inline __m256d __DEFAULT_FN_ATTRS
				105	_mm256_sub_pd(__m256d __a, __m256d __b)
				106	{
				107	return (__m256d)((__v4df)__a-(__v4df)__b);
				108	}
				109
				110	/// \brief Subtracts two 256-bit vectors of [8 x float].
				111	///
				112	/// \headerfile <x86intrin.h>
				113	///
				114	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
				115	///
				116	/// \param __a
				117	/// A 256-bit vector of [8 x float] containing the minuend.
				118	/// \param __b
				119	/// A 256-bit vector of [8 x float] containing the subtrahend.
				120	/// \returns A 256-bit vector of [8 x float] containing the differences between
				121	/// both operands.
				122	static __inline __m256 __DEFAULT_FN_ATTRS
				123	_mm256_sub_ps(__m256 __a, __m256 __b)
				124	{
				125	return (__m256)((__v8sf)__a-(__v8sf)__b);
				126	}
				127
				128	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				129	/// two 256-bit vectors of [4 x double].
				130	///
				131	/// \headerfile <x86intrin.h>
				132	///
				133	/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
				134	///
				135	/// \param __a
				136	/// A 256-bit vector of [4 x double] containing the left source operand.
				137	/// \param __b
				138	/// A 256-bit vector of [4 x double] containing the right source operand.
				139	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				140	/// and differences between both operands.
				141	static __inline __m256d __DEFAULT_FN_ATTRS
				142	_mm256_addsub_pd(__m256d __a, __m256d __b)
				143	{
				144	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
				145	}
				146
				147	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
				148	/// two 256-bit vectors of [8 x float].
				149	///
				150	/// \headerfile <x86intrin.h>
				151	///
				152	/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
				153	///
				154	/// \param __a
				155	/// A 256-bit vector of [8 x float] containing the left source operand.
				156	/// \param __b
				157	/// A 256-bit vector of [8 x float] containing the right source operand.
				158	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				159	/// differences between both operands.
				160	static __inline __m256 __DEFAULT_FN_ATTRS
				161	_mm256_addsub_ps(__m256 __a, __m256 __b)
				162	{
				163	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
				164	}
				165
				166	/// \brief Divides two 256-bit vectors of [4 x double].
				167	///
				168	/// \headerfile <x86intrin.h>
				169	///
				170	/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
				171	///
				172	/// \param __a
				173	/// A 256-bit vector of [4 x double] containing the dividend.
				174	/// \param __b
				175	/// A 256-bit vector of [4 x double] containing the divisor.
				176	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				177	/// operands.
				178	static __inline __m256d __DEFAULT_FN_ATTRS
				179	_mm256_div_pd(__m256d __a, __m256d __b)
				180	{
				181	return (__m256d)((__v4df)__a/(__v4df)__b);
				182	}
				183
				184	/// \brief Divides two 256-bit vectors of [8 x float].
				185	///
				186	/// \headerfile <x86intrin.h>
				187	///
				188	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
				189	///
				190	/// \param __a
				191	/// A 256-bit vector of [8 x float] containing the dividend.
				192	/// \param __b
				193	/// A 256-bit vector of [8 x float] containing the divisor.
				194	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				195	/// operands.
				196	static __inline __m256 __DEFAULT_FN_ATTRS
				197	_mm256_div_ps(__m256 __a, __m256 __b)
				198	{
				199	return (__m256)((__v8sf)__a/(__v8sf)__b);
				200	}
				201
				202	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
				203	/// of each pair of values.
				204	///
				205	/// \headerfile <x86intrin.h>
				206	///
				207	/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
				208	///
				209	/// \param __a
				210	/// A 256-bit vector of [4 x double] containing one of the operands.
				211	/// \param __b
				212	/// A 256-bit vector of [4 x double] containing one of the operands.
				213	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				214	/// between both operands.
				215	static __inline __m256d __DEFAULT_FN_ATTRS
				216	_mm256_max_pd(__m256d __a, __m256d __b)
				217	{
				218	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
				219	}
				220
				221	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
				222	/// of each pair of values.
				223	///
				224	/// \headerfile <x86intrin.h>
				225	///
				226	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
				227	///
				228	/// \param __a
				229	/// A 256-bit vector of [8 x float] containing one of the operands.
				230	/// \param __b
				231	/// A 256-bit vector of [8 x float] containing one of the operands.
				232	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				233	/// between both operands.
				234	static __inline __m256 __DEFAULT_FN_ATTRS
				235	_mm256_max_ps(__m256 __a, __m256 __b)
				236	{
				237	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
				238	}
				239
				240	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
				241	/// of each pair of values.
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
				245	/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
				246	///
				247	/// \param __a
				248	/// A 256-bit vector of [4 x double] containing one of the operands.
				249	/// \param __b
				250	/// A 256-bit vector of [4 x double] containing one of the operands.
				251	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				252	/// between both operands.
				253	static __inline __m256d __DEFAULT_FN_ATTRS
				254	_mm256_min_pd(__m256d __a, __m256d __b)
				255	{
				256	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
				257	}
				258
				259	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
				260	/// of each pair of values.
				261	///
				262	/// \headerfile <x86intrin.h>
				263	///
				264	/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
				265	///
				266	/// \param __a
				267	/// A 256-bit vector of [8 x float] containing one of the operands.
				268	/// \param __b
				269	/// A 256-bit vector of [8 x float] containing one of the operands.
				270	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				271	/// between both operands.
				272	static __inline __m256 __DEFAULT_FN_ATTRS
				273	_mm256_min_ps(__m256 __a, __m256 __b)
				274	{
				275	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
				276	}
				277
				278	/// \brief Multiplies two 256-bit vectors of [4 x double].
				279	///
				280	/// \headerfile <x86intrin.h>
				281	///
				282	/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
				283	///
				284	/// \param __a
				285	/// A 256-bit vector of [4 x double] containing one of the operands.
				286	/// \param __b
				287	/// A 256-bit vector of [4 x double] containing one of the operands.
				288	/// \returns A 256-bit vector of [4 x double] containing the products of both
				289	/// operands.
				290	static __inline __m256d __DEFAULT_FN_ATTRS
				291	_mm256_mul_pd(__m256d __a, __m256d __b)
				292	{
				293	return (__m256d)((__v4df)__a * (__v4df)__b);
				294	}
				295
				296	/// \brief Multiplies two 256-bit vectors of [8 x float].
				297	///
				298	/// \headerfile <x86intrin.h>
				299	///
				300	/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
				301	///
				302	/// \param __a
				303	/// A 256-bit vector of [8 x float] containing one of the operands.
				304	/// \param __b
				305	/// A 256-bit vector of [8 x float] containing one of the operands.
				306	/// \returns A 256-bit vector of [8 x float] containing the products of both
				307	/// operands.
				308	static __inline __m256 __DEFAULT_FN_ATTRS
				309	_mm256_mul_ps(__m256 __a, __m256 __b)
				310	{
				311	return (__m256)((__v8sf)__a * (__v8sf)__b);
				312	}
				313
				314	/// \brief Calculates the square roots of the values in a 256-bit vector of
				315	/// [4 x double].
				316	///
				317	/// \headerfile <x86intrin.h>
				318	///
				319	/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
				320	///
				321	/// \param __a
				322	/// A 256-bit vector of [4 x double].
				323	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				324	/// values in the operand.
				325	static __inline __m256d __DEFAULT_FN_ATTRS
				326	_mm256_sqrt_pd(__m256d __a)
				327	{
				328	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
				329	}
				330
				331	/// \brief Calculates the square roots of the values in a 256-bit vector of
				332	/// [8 x float].
				333	///
				334	/// \headerfile <x86intrin.h>
				335	///
				336	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
				337	///
				338	/// \param __a
				339	/// A 256-bit vector of [8 x float].
				340	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				341	/// values in the operand.
				342	static __inline __m256 __DEFAULT_FN_ATTRS
				343	_mm256_sqrt_ps(__m256 __a)
				344	{
				345	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
				346	}
				347
				348	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
				349	/// vector of [8 x float].
				350	///
				351	/// \headerfile <x86intrin.h>
				352	///
				353	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
				354	///
				355	/// \param __a
				356	/// A 256-bit vector of [8 x float].
				357	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				358	/// roots of the values in the operand.
				359	static __inline __m256 __DEFAULT_FN_ATTRS
				360	_mm256_rsqrt_ps(__m256 __a)
				361	{
				362	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
				363	}
				364
				365	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
				366	/// [8 x float].
				367	///
				368	/// \headerfile <x86intrin.h>
				369	///
				370	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
				371	///
				372	/// \param __a
				373	/// A 256-bit vector of [8 x float].
				374	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				375	/// values in the operand.
				376	static __inline __m256 __DEFAULT_FN_ATTRS
				377	_mm256_rcp_ps(__m256 __a)
				378	{
				379	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
				380	}
				381
				382	/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
				383	/// by the byte operand. The source values are rounded to integer values and
				384	/// returned as 64-bit double-precision floating-point values.
				385	///
				386	/// \headerfile <x86intrin.h>
				387	///
				388	/// \code
				389	/// __m256d _mm256_round_pd(__m256d V, const int M);
				390	/// \endcode
				391	///
				392	/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				393	///
				394	/// \param V
				395	/// A 256-bit vector of [4 x double].
				396	/// \param M
				397	/// An integer value that specifies the rounding operation.
				398	/// Bits [7:4] are reserved.
				399	/// Bit [3] is a precision exception value:
				400	/// 0: A normal PE exception is used.
				401	/// 1: The PE field is not updated.
				402	/// Bit [2] is the rounding control source:
				403	/// 0: Use bits [1:0] of M.
				404	/// 1: Use the current MXCSR setting.
				405	/// Bits [1:0] contain the rounding control definition:
				406	/// 00: Nearest.
				407	/// 01: Downward (toward negative infinity).
				408	/// 10: Upward (toward positive infinity).
				409	/// 11: Truncated.
				410	/// \returns A 256-bit vector of [4 x double] containing the rounded values.
				411	#define _mm256_round_pd(V, M) __extension__ ({ \
				412	(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
				413
				414	/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
				415	/// specified by the byte operand. The source values are rounded to integer
				416	/// values and returned as floating-point values.
				417	///
				418	/// \headerfile <x86intrin.h>
				419	///
				420	/// \code
				421	/// __m256 _mm256_round_ps(__m256 V, const int M);
				422	/// \endcode
				423	///
				424	/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				425	///
				426	/// \param V
				427	/// A 256-bit vector of [8 x float].
				428	/// \param M
				429	/// An integer value that specifies the rounding operation.
				430	/// Bits [7:4] are reserved.
				431	/// Bit [3] is a precision exception value:
				432	/// 0: A normal PE exception is used.
				433	/// 1: The PE field is not updated.
				434	/// Bit [2] is the rounding control source:
				435	/// 0: Use bits [1:0] of M.
				436	/// 1: Use the current MXCSR setting.
				437	/// Bits [1:0] contain the rounding control definition:
				438	/// 00: Nearest.
				439	/// 01: Downward (toward negative infinity).
				440	/// 10: Upward (toward positive infinity).
				441	/// 11: Truncated.
				442	/// \returns A 256-bit vector of [8 x float] containing the rounded values.
				443	#define _mm256_round_ps(V, M) __extension__ ({ \
				444	(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
				445
				446	/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
				447	/// source values are rounded up to integer values and returned as 64-bit
				448	/// double-precision floating-point values.
				449	///
				450	/// \headerfile <x86intrin.h>
				451	///
				452	/// \code
				453	/// __m256d _mm256_ceil_pd(__m256d V);
				454	/// \endcode
				455	///
				456	/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				457	///
				458	/// \param V
				459	/// A 256-bit vector of [4 x double].
				460	/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
				461	#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
				462
				463	/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
				464	/// The source values are rounded down to integer values and returned as
				465	/// 64-bit double-precision floating-point values.
				466	///
				467	/// \headerfile <x86intrin.h>
				468	///
				469	/// \code
				470	/// __m256d _mm256_floor_pd(__m256d V);
				471	/// \endcode
				472	///
				473	/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
				474	///
				475	/// \param V
				476	/// A 256-bit vector of [4 x double].
				477	/// \returns A 256-bit vector of [4 x double] containing the rounded down
				478	/// values.
				479	#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
				480
				481	/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
				482	/// source values are rounded up to integer values and returned as
				483	/// floating-point values.
				484	///
				485	/// \headerfile <x86intrin.h>
				486	///
				487	/// \code
				488	/// __m256 _mm256_ceil_ps(__m256 V);
				489	/// \endcode
				490	///
				491	/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				492	///
				493	/// \param V
				494	/// A 256-bit vector of [8 x float].
				495	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
				496	#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
				497
				498	/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
				499	/// source values are rounded down to integer values and returned as
				500	/// floating-point values.
				501	///
				502	/// \headerfile <x86intrin.h>
				503	///
				504	/// \code
				505	/// __m256 _mm256_floor_ps(__m256 V);
				506	/// \endcode
				507	///
				508	/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
				509	///
				510	/// \param V
				511	/// A 256-bit vector of [8 x float].
				512	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
				513	#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				514
				515	/* Logical */
				516	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
				517	///
				518	/// \headerfile <x86intrin.h>
				519	///
				520	/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
				521	///
				522	/// \param __a
				523	/// A 256-bit vector of [4 x double] containing one of the source operands.
				524	/// \param __b
				525	/// A 256-bit vector of [4 x double] containing one of the source operands.
				526	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				527	/// values between both operands.
				528	static __inline __m256d __DEFAULT_FN_ATTRS
				529	_mm256_and_pd(__m256d __a, __m256d __b)
				530	{
				531	return (__m256d)((__v4du)__a & (__v4du)__b);
				532	}
				533
				534	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
				535	///
				536	/// \headerfile <x86intrin.h>
				537	///
				538	/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
				539	///
				540	/// \param __a
				541	/// A 256-bit vector of [8 x float] containing one of the source operands.
				542	/// \param __b
				543	/// A 256-bit vector of [8 x float] containing one of the source operands.
				544	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				545	/// values between both operands.
				546	static __inline __m256 __DEFAULT_FN_ATTRS
				547	_mm256_and_ps(__m256 __a, __m256 __b)
				548	{
				549	return (__m256)((__v8su)__a & (__v8su)__b);
				550	}
				551
				552	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
				553	/// the one's complement of the values contained in the first source operand.
				554	///
				555	/// \headerfile <x86intrin.h>
				556	///
				557	/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
				558	///
				559	/// \param __a
				560	/// A 256-bit vector of [4 x double] containing the left source operand. The
				561	/// one's complement of this value is used in the bitwise AND.
				562	/// \param __b
				563	/// A 256-bit vector of [4 x double] containing the right source operand.
				564	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				565	/// values of the second operand and the one's complement of the first
				566	/// operand.
				567	static __inline __m256d __DEFAULT_FN_ATTRS
				568	_mm256_andnot_pd(__m256d __a, __m256d __b)
				569	{
				570	return (__m256d)(~(__v4du)__a & (__v4du)__b);
				571	}
				572
				573	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
				574	/// the one's complement of the values contained in the first source operand.
				575	///
				576	/// \headerfile <x86intrin.h>
				577	///
				578	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
				579	///
				580	/// \param __a
				581	/// A 256-bit vector of [8 x float] containing the left source operand. The
				582	/// one's complement of this value is used in the bitwise AND.
				583	/// \param __b
				584	/// A 256-bit vector of [8 x float] containing the right source operand.
				585	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				586	/// values of the second operand and the one's complement of the first
				587	/// operand.
				588	static __inline __m256 __DEFAULT_FN_ATTRS
				589	_mm256_andnot_ps(__m256 __a, __m256 __b)
				590	{
				591	return (__m256)(~(__v8su)__a & (__v8su)__b);
				592	}
				593
				594	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
				595	///
				596	/// \headerfile <x86intrin.h>
				597	///
				598	/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
				599	///
				600	/// \param __a
				601	/// A 256-bit vector of [4 x double] containing one of the source operands.
				602	/// \param __b
				603	/// A 256-bit vector of [4 x double] containing one of the source operands.
				604	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				605	/// values between both operands.
				606	static __inline __m256d __DEFAULT_FN_ATTRS
				607	_mm256_or_pd(__m256d __a, __m256d __b)
				608	{
				609	return (__m256d)((__v4du)__a \| (__v4du)__b);
				610	}
				611
				612	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
				613	///
				614	/// \headerfile <x86intrin.h>
				615	///
				616	/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
				617	///
				618	/// \param __a
				619	/// A 256-bit vector of [8 x float] containing one of the source operands.
				620	/// \param __b
				621	/// A 256-bit vector of [8 x float] containing one of the source operands.
				622	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				623	/// values between both operands.
				624	static __inline __m256 __DEFAULT_FN_ATTRS
				625	_mm256_or_ps(__m256 __a, __m256 __b)
				626	{
				627	return (__m256)((__v8su)__a \| (__v8su)__b);
				628	}
				629
				630	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
				631	///
				632	/// \headerfile <x86intrin.h>
				633	///
				634	/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
				635	///
				636	/// \param __a
				637	/// A 256-bit vector of [4 x double] containing one of the source operands.
				638	/// \param __b
				639	/// A 256-bit vector of [4 x double] containing one of the source operands.
				640	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				641	/// values between both operands.
				642	static __inline __m256d __DEFAULT_FN_ATTRS
				643	_mm256_xor_pd(__m256d __a, __m256d __b)
				644	{
				645	return (__m256d)((__v4du)__a ^ (__v4du)__b);
				646	}
				647
				648	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
				649	///
				650	/// \headerfile <x86intrin.h>
				651	///
				652	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				653	///
				654	/// \param __a
				655	/// A 256-bit vector of [8 x float] containing one of the source operands.
				656	/// \param __b
				657	/// A 256-bit vector of [8 x float] containing one of the source operands.
				658	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				659	/// values between both operands.
				660	static __inline __m256 __DEFAULT_FN_ATTRS
				661	_mm256_xor_ps(__m256 __a, __m256 __b)
				662	{
				663	return (__m256)((__v8su)__a ^ (__v8su)__b);
				664	}
				665
				666	/* Horizontal arithmetic */
				667	/// \brief Horizontally adds the adjacent pairs of values contained in two
				668	/// 256-bit vectors of [4 x double].
				669	///
				670	/// \headerfile <x86intrin.h>
				671	///
				672	/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
				673	///
				674	/// \param __a
				675	/// A 256-bit vector of [4 x double] containing one of the source operands.
				676	/// The horizontal sums of the values are returned in the even-indexed
				677	/// elements of a vector of [4 x double].
				678	/// \param __b
				679	/// A 256-bit vector of [4 x double] containing one of the source operands.
				680	/// The horizontal sums of the values are returned in the odd-indexed
				681	/// elements of a vector of [4 x double].
				682	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				683	/// both operands.
				684	static __inline __m256d __DEFAULT_FN_ATTRS
				685	_mm256_hadd_pd(__m256d __a, __m256d __b)
				686	{
				687	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
				688	}
				689
				690	/// \brief Horizontally adds the adjacent pairs of values contained in two
				691	/// 256-bit vectors of [8 x float].
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
				695	/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
				696	///
				697	/// \param __a
				698	/// A 256-bit vector of [8 x float] containing one of the source operands.
				699	/// The horizontal sums of the values are returned in the elements with
				700	/// index 0, 1, 4, 5 of a vector of [8 x float].
				701	/// \param __b
				702	/// A 256-bit vector of [8 x float] containing one of the source operands.
				703	/// The horizontal sums of the values are returned in the elements with
				704	/// index 2, 3, 6, 7 of a vector of [8 x float].
				705	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				706	/// both operands.
				707	static __inline __m256 __DEFAULT_FN_ATTRS
				708	_mm256_hadd_ps(__m256 __a, __m256 __b)
				709	{
				710	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
				711	}
				712
				713	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				714	/// 256-bit vectors of [4 x double].
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
				718	/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
				719	///
				720	/// \param __a
				721	/// A 256-bit vector of [4 x double] containing one of the source operands.
				722	/// The horizontal differences between the values are returned in the
				723	/// even-indexed elements of a vector of [4 x double].
				724	/// \param __b
				725	/// A 256-bit vector of [4 x double] containing one of the source operands.
				726	/// The horizontal differences between the values are returned in the
				727	/// odd-indexed elements of a vector of [4 x double].
				728	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				729	/// differences of both operands.
				730	static __inline __m256d __DEFAULT_FN_ATTRS
				731	_mm256_hsub_pd(__m256d __a, __m256d __b)
				732	{
				733	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
				734	}
				735
				736	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
				737	/// 256-bit vectors of [8 x float].
				738	///
				739	/// \headerfile <x86intrin.h>
				740	///
				741	/// This intrinsic corresponds to the \c VHSUBPS instruction.
				742	///
				743	/// \param __a
				744	/// A 256-bit vector of [8 x float] containing one of the source operands.
				745	/// The horizontal differences between its adjacent values are returned in
				746	/// the elements with index 0, 1, 4, 5 of the resulting vector of [8 x float].
				747	/// \param __b
				748	/// A 256-bit vector of [8 x float] containing one of the source operands.
				749	/// The horizontal differences between its adjacent values are returned in
				750	/// the elements with index 2, 3, 6, 7 of the resulting vector of [8 x float].
				751	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				752	/// differences of both operands.
				753	static __inline __m256 __DEFAULT_FN_ATTRS
				754	_mm256_hsub_ps(__m256 __a, __m256 __b)
				755	{
				756	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
				757	}
				758
				759	/* Vector permutations */
				760	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
				761	/// by the 128-bit integer vector operand.
				762	///
				763	/// \headerfile <x86intrin.h>
				764	///
				765	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				766	///
				767	/// \param __a
				768	/// A 128-bit vector of [2 x double].
				769	/// \param __c
				770	/// A 128-bit integer vector operand specifying how the values are to be
				771	/// copied.
				772	/// Bit [1]:
				773	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				774	/// returned vector.
				775	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				776	/// returned vector.
				777	/// Bit [65]:
				778	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				779	/// returned vector.
				780	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				781	/// returned vector.
				782	/// \returns A 128-bit vector of [2 x double] containing the copied values.
				783	static __inline __m128d __DEFAULT_FN_ATTRS
				784	_mm_permutevar_pd(__m128d __a, __m128i __c)
				785	{
				786	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
				787	}
				788
				789	/// \brief Copies the values in a 256-bit vector of [4 x double] as
				790	/// specified by the 256-bit integer vector operand.
				791	///
				792	/// \headerfile <x86intrin.h>
				793	///
				794	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				795	///
				796	/// \param __a
				797	/// A 256-bit vector of [4 x double].
				798	/// \param __c
				799	/// A 256-bit integer vector operand specifying how the values are to be
				800	/// copied.
				801	/// Bit [1]:
				802	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				803	/// returned vector.
				804	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				805	/// returned vector.
				806	/// Bit [65]:
				807	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				808	/// returned vector.
				809	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				810	/// returned vector.
				811	/// Bit [129]:
				812	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				813	/// returned vector.
				814	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				815	/// returned vector.
				816	/// Bit [193]:
				817	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				818	/// returned vector.
				819	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				820	/// returned vector.
				821	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				822	static __inline __m256d __DEFAULT_FN_ATTRS
				823	_mm256_permutevar_pd(__m256d __a, __m256i __c)
				824	{
				825	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
				826	}
				827
				828	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
				829	/// specified by the 128-bit integer vector operand.
				830	///
				831	/// \headerfile <x86intrin.h>
				832	///
				833	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				834	///
				835	/// \param __a
				836	/// A 128-bit vector of [4 x float].
				837	/// \param __c
				838	/// A 128-bit integer vector operand specifying how the values are to be
				839	/// copied.
				840	/// Bits [1:0]:
				841	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				842	/// returned vector.
				843	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				844	/// returned vector.
				845	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				846	/// returned vector.
				847	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				848	/// returned vector.
				849	/// Bits [33:32]:
				850	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				851	/// returned vector.
				852	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				853	/// returned vector.
				854	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				855	/// returned vector.
				856	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				857	/// returned vector.
				858	/// Bits [65:64]:
				859	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				860	/// returned vector.
				861	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				862	/// returned vector.
				863	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				864	/// returned vector.
				865	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				866	/// returned vector.
				867	/// Bits [97:96]:
				868	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				869	/// returned vector.
				870	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				871	/// returned vector.
				872	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				873	/// returned vector.
				874	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				875	/// returned vector.
				876	/// \returns A 128-bit vector of [4 x float] containing the copied values.
				877	static __inline __m128 __DEFAULT_FN_ATTRS
				878	_mm_permutevar_ps(__m128 __a, __m128i __c)
				879	{
				880	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
				881	}
				882
				883	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
				884	/// specified by the 256-bit integer vector operand.
				885	///
				886	/// \headerfile <x86intrin.h>
				887	///
				888	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				889	///
				890	/// \param __a
				891	/// A 256-bit vector of [8 x float].
				892	/// \param __c
				893	/// A 256-bit integer vector operand specifying how the values are to be
				894	/// copied.
				895	/// Bits [1:0]:
				896	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				897	/// returned vector.
				898	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				899	/// returned vector.
				900	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				901	/// returned vector.
				902	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				903	/// returned vector.
				904	/// Bits [33:32]:
				905	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				906	/// returned vector.
				907	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				908	/// returned vector.
				909	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				910	/// returned vector.
				911	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				912	/// returned vector.
				913	/// Bits [65:64]:
				914	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				915	/// returned vector.
				916	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				917	/// returned vector.
				918	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				919	/// returned vector.
				920	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				921	/// returned vector.
				922	/// Bits [97:96]:
				923	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				924	/// returned vector.
				925	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				926	/// returned vector.
				927	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				928	/// returned vector.
				929	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				930	/// returned vector.
				931	/// Bits [129:128]:
				932	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				933	/// returned vector.
				934	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				935	/// returned vector.
				936	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				937	/// returned vector.
				938	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				939	/// returned vector.
				940	/// Bits [161:160]:
				941	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				942	/// returned vector.
				943	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				944	/// returned vector.
				945	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				946	/// returned vector.
				947	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				948	/// returned vector.
				949	/// Bits [193:192]:
				950	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				951	/// returned vector.
				952	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				953	/// returned vector.
				954	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				955	/// returned vector.
				956	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				957	/// returned vector.
				958	/// Bits [225:224]:
				959	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				960	/// returned vector.
				961	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				962	/// returned vector.
				963	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				964	/// returned vector.
				965	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				966	/// returned vector.
				967	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				968	static __inline __m256 __DEFAULT_FN_ATTRS
				969	_mm256_permutevar_ps(__m256 __a, __m256i __c)
				970	{
				971	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
				972	}
				973
				974	/// \brief Copies the values in a 128-bit vector of [2 x double] as
				975	/// specified by the immediate integer operand.
				976	///
				977	/// \headerfile <x86intrin.h>
				978	///
				979	/// \code
				980	/// __m128d _mm_permute_pd(__m128d A, const int C);
				981	/// \endcode
				982	///
				983	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				984	///
				985	/// \param A
				986	/// A 128-bit vector of [2 x double].
				987	/// \param C
				988	/// An immediate integer operand specifying how the values are to be copied.
				989	/// Bit [0]:
				990	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				991	/// returned vector.
				992	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				993	/// returned vector.
				994	/// Bit [1]:
				995	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				996	/// returned vector.
				997	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				998	/// returned vector.
				999	/// \returns A 128-bit vector of [2 x double] containing the copied values.
				1000	#define _mm_permute_pd(A, C) __extension__ ({ \
				1001	(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
				1002	(__v2df)_mm_undefined_pd(), \
				1003	((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
				1004
				1005	/// \brief Copies the values in a 256-bit vector of [4 x double] as
				1006	/// specified by the immediate integer operand.
				1007	///
				1008	/// \headerfile <x86intrin.h>
				1009	///
				1010	/// \code
				1011	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1012	/// \endcode
				1013	///
				1014	/// This intrinsic corresponds to the \c VPERMILPD instruction.
				1015	///
				1016	/// \param A
				1017	/// A 256-bit vector of [4 x double].
				1018	/// \param C
				1019	/// An immediate integer operand specifying how the values are to be copied.
				1020	/// Bit [0]:
				1021	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
				1022	/// returned vector.
				1023	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				1024	/// returned vector.
				1025	/// Bit [1]:
				1026	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				1027	/// returned vector.
				1028	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				1029	/// returned vector.
				1030	/// Bit [2]:
				1031	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				1032	/// returned vector.
				1033	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				1034	/// returned vector.
				1035	/// Bit [3]:
				1036	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				1037	/// returned vector.
				1038	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				1039	/// returned vector.
				1040	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				1041	#define _mm256_permute_pd(A, C) __extension__ ({ \
				1042	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
				1043	(__v4df)_mm256_undefined_pd(), \
				1044	0 + (((C) >> 0) & 0x1), \
				1045	0 + (((C) >> 1) & 0x1), \
				1046	2 + (((C) >> 2) & 0x1), \
				1047	2 + (((C) >> 3) & 0x1)); })
				1048
				1049	/// \brief Copies the values in a 128-bit vector of [4 x float] as
				1050	/// specified by the immediate integer operand.
				1051	///
				1052	/// \headerfile <x86intrin.h>
				1053	///
				1054	/// \code
				1055	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1056	/// \endcode
				1057	///
				1058	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1059	///
				1060	/// \param A
				1061	/// A 128-bit vector of [4 x float].
				1062	/// \param C
				1063	/// An immediate integer operand specifying how the values are to be copied.
				1064	/// Bits [1:0]:
				1065	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1066	/// returned vector.
				1067	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1068	/// returned vector.
				1069	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1070	/// returned vector.
				1071	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1072	/// returned vector.
				1073	/// Bits [3:2]:
				1074	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1075	/// returned vector.
				1076	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1077	/// returned vector.
				1078	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1079	/// returned vector.
				1080	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1081	/// returned vector.
				1082	/// Bits [5:4]:
				1083	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1084	/// returned vector.
				1085	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1086	/// returned vector.
				1087	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1088	/// returned vector.
				1089	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1090	/// returned vector.
				1091	/// Bits [7:6]:
				1092	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1093	/// returned vector.
				1094	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1095	/// returned vector.
				1096	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1097	/// returned vector.
				1098	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1099	/// returned vector.
				1100	/// \returns A 128-bit vector of [4 x float] containing the copied values.
				1101	#define _mm_permute_ps(A, C) __extension__ ({ \
				1102	(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
				1103	(__v4sf)_mm_undefined_ps(), \
				1104	((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
				1105	((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
				1106
				1107	/// \brief Copies the values in a 256-bit vector of [8 x float] as
				1108	/// specified by the immediate integer operand.
				1109	///
				1110	/// \headerfile <x86intrin.h>
				1111	///
				1112	/// \code
				1113	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1114	/// \endcode
				1115	///
				1116	/// This intrinsic corresponds to the \c VPERMILPS instruction.
				1117	///
				1118	/// \param A
				1119	/// A 256-bit vector of [8 x float].
				1120	/// \param C
				1121	/// An immediate integer operand specifying how the values are to be copied.
				1122	/// Bits [1:0]:
				1123	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1124	/// returned vector.
				1125	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1126	/// returned vector.
				1127	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1128	/// returned vector.
				1129	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1130	/// returned vector.
				1131	/// Bits [3:2]:
				1132	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1133	/// returned vector.
				1134	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1135	/// returned vector.
				1136	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1137	/// returned vector.
				1138	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1139	/// returned vector.
				1140	/// Bits [5:4]:
				1141	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1142	/// returned vector.
				1143	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1144	/// returned vector.
				1145	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1146	/// returned vector.
				1147	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1148	/// returned vector.
				1149	/// Bits [7:6]:
				1150	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1151	/// returned vector.
				1152	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1153	/// returned vector.
				1154	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1155	/// returned vector.
				1156	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1157	/// returned vector.
				1158	/// Bits [1:0] (same control bits, applied to the upper 128 bits):
				1159	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				1160	/// returned vector.
				1161	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				1162	/// returned vector.
				1163	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				1164	/// returned vector.
				1165	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				1166	/// returned vector.
				1167	/// Bits [3:2] (same control bits, applied to the upper 128 bits):
				1168	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				1169	/// returned vector.
				1170	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				1171	/// returned vector.
				1172	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				1173	/// returned vector.
				1174	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				1175	/// returned vector.
				1176	/// Bits [5:4] (same control bits, applied to the upper 128 bits):
				1177	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				1178	/// returned vector.
				1179	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				1180	/// returned vector.
				1181	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				1182	/// returned vector.
				1183	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				1184	/// returned vector.
				1185	/// Bits [7:6] (same control bits, applied to the upper 128 bits):
				1186	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				1187	/// returned vector.
				1188	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				1189	/// returned vector.
				1190	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				1191	/// returned vector.
				1192	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				1193	/// returned vector.
				1194	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				1195	#define _mm256_permute_ps(A, C) __extension__ ({ \
				1196	(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
				1197	(__v8sf)_mm256_undefined_ps(), \
				1198	0 + (((C) >> 0) & 0x3), \
				1199	0 + (((C) >> 2) & 0x3), \
				1200	0 + (((C) >> 4) & 0x3), \
				1201	0 + (((C) >> 6) & 0x3), \
				1202	4 + (((C) >> 0) & 0x3), \
				1203	4 + (((C) >> 2) & 0x3), \
				1204	4 + (((C) >> 4) & 0x3), \
				1205	4 + (((C) >> 6) & 0x3)); })
				1206
				1207	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1208	/// [4 x double], as specified by the immediate integer operand.
				1209	///
				1210	/// \headerfile <x86intrin.h>
				1211	///
				1212	/// \code
				1213	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1214	/// \endcode
				1215	///
				1216	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1217	///
				1218	/// \param V1
				1219	/// A 256-bit vector of [4 x double].
				1220	/// \param V2
				1221	/// A 256-bit vector of [4 x double].
				1222	/// \param M
				1223	/// An immediate integer operand specifying how the values are to be
				1224	/// permuted.
				1225	/// Bits [1:0]:
				1226	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1227	/// destination.
				1228	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1229	/// destination.
				1230	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1231	/// destination.
				1232	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1233	/// destination.
				1234	/// Bits [5:4]:
				1235	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1236	/// destination.
				1237	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1238	/// destination.
				1239	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1240	/// destination.
				1241	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1242	/// destination.
				1243	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				1244	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
				1245	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
				1246	(__v4df)(__m256d)(V2), (M)); })
				1247
				1248	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
				1249	/// [8 x float], as specified by the immediate integer operand.
				1250	///
				1251	/// \headerfile <x86intrin.h>
				1252	///
				1253	/// \code
				1254	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1255	/// \endcode
				1256	///
				1257	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1258	///
				1259	/// \param V1
				1260	/// A 256-bit vector of [8 x float].
				1261	/// \param V2
				1262	/// A 256-bit vector of [8 x float].
				1263	/// \param M
				1264	/// An immediate integer operand specifying how the values are to be
				1265	/// permuted.
				1266	/// Bits [1:0]:
				1267	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1268	/// destination.
				1269	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1270	/// destination.
				1271	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1272	/// destination.
				1273	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1274	/// destination.
				1275	/// Bits [5:4]:
				1276	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1277	/// destination.
				1278	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1279	/// destination.
				1280	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1281	/// destination.
				1282	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1283	/// destination.
				1284	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				1285	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
				1286	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
				1287	(__v8sf)(__m256)(V2), (M)); })
				1288
				1289	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
				1290	/// as specified by the immediate integer operand.
				1291	///
				1292	/// \headerfile <x86intrin.h>
				1293	///
				1294	/// \code
				1295	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1296	/// \endcode
				1297	///
				1298	/// This intrinsic corresponds to the \c VPERM2F128 instruction.
				1299	///
				1300	/// \param V1
				1301	/// A 256-bit integer vector.
				1302	/// \param V2
				1303	/// A 256-bit integer vector.
				1304	/// \param M
				1305	/// An immediate integer operand specifying how the values are to be permuted.
				1306	/// Bits [1:0]:
				1307	/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
				1308	/// destination.
				1309	/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
				1310	/// destination.
				1311	/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
				1312	/// destination.
				1313	/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
				1314	/// destination.
				1315	/// Bits [5:4]:
				1316	/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
				1317	/// destination.
				1318	/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
				1319	/// destination.
				1320	/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
				1321	/// destination.
				1322	/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
				1323	/// destination.
				1324	/// \returns A 256-bit integer vector containing the copied values.
				1325	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
				1326	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
				1327	(__v8si)(__m256i)(V2), (M)); })
				1328
				1329	/* Vector Blend */
				1330	/// \brief Merges 64-bit double-precision data values stored in either of the
				1331	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1332	/// integer operand.
				1333	///
				1334	/// \headerfile <x86intrin.h>
				1335	///
				1336	/// \code
				1337	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1338	/// \endcode
				1339	///
				1340	/// This intrinsic corresponds to the \c VBLENDPD instruction.
				1341	///
				1342	/// \param V1
				1343	/// A 256-bit vector of [4 x double].
				1344	/// \param V2
				1345	/// A 256-bit vector of [4 x double].
				1346	/// \param M
				1347	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1348	/// values are to be copied. The position of the mask bit corresponds to the
				1349	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				1350	/// element in operand V1 is copied to the same position in the destination.
				1351	/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
				1352	/// copied to the same position in the destination.
				1353	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				1354	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
				1355	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
				1356	(__v4df)(__m256d)(V2), \
				1357	(((M) & 0x01) ? 4 : 0), \
				1358	(((M) & 0x02) ? 5 : 1), \
				1359	(((M) & 0x04) ? 6 : 2), \
				1360	(((M) & 0x08) ? 7 : 3)); })
				1361
				1362	/// \brief Merges 32-bit single-precision data values stored in either of the
				1363	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1364	/// integer operand.
				1365	///
				1366	/// \headerfile <x86intrin.h>
				1367	///
				1368	/// \code
				1369	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1370	/// \endcode
				1371	///
				1372	/// This intrinsic corresponds to the \c VBLENDPS instruction.
				1373	///
				1374	/// \param V1
				1375	/// A 256-bit vector of [8 x float].
				1376	/// \param V2
				1377	/// A 256-bit vector of [8 x float].
				1378	/// \param M
				1379	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1380	/// values are to be copied. The position of the mask bit corresponds to the
				1381	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				1382	/// element in operand V1 is copied to the same position in the destination.
				1383	/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
				1384	/// copied to the same position in the destination.
				1385	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				1386	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
				1387	(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
				1388	(__v8sf)(__m256)(V2), \
				1389	(((M) & 0x01) ? 8 : 0), \
				1390	(((M) & 0x02) ? 9 : 1), \
				1391	(((M) & 0x04) ? 10 : 2), \
				1392	(((M) & 0x08) ? 11 : 3), \
				1393	(((M) & 0x10) ? 12 : 4), \
				1394	(((M) & 0x20) ? 13 : 5), \
				1395	(((M) & 0x40) ? 14 : 6), \
				1396	(((M) & 0x80) ? 15 : 7)); })
				1397
				1398	/// \brief Merges 64-bit double-precision data values stored in either of the
				1399	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1400	/// operand.
				1401	///
				1402	/// \headerfile <x86intrin.h>
				1403	///
				1404	/// This intrinsic corresponds to the \c VBLENDVPD instruction.
				1405	///
				1406	/// \param __a
				1407	/// A 256-bit vector of [4 x double].
				1408	/// \param __b
				1409	/// A 256-bit vector of [4 x double].
				1410	/// \param __c
				1411	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1412	/// how the values are to be copied. The position of the mask bit corresponds
				1413	/// to the most significant bit of a copied value. When a mask bit is 0, the
				1414	/// corresponding 64-bit element in operand __a is copied to the same
				1415	/// position in the destination. When a mask bit is 1, the corresponding
				1416	/// 64-bit element in operand __b is copied to the same position in the
				1417	/// destination.
				1418	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				1419	static __inline __m256d __DEFAULT_FN_ATTRS
				1420	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
				1421	{
				1422	return (__m256d)__builtin_ia32_blendvpd256(
				1423	(__v4df)__a, (__v4df)__b, (__v4df)__c);
				1424	}
				1425
				1426	/// \brief Merges 32-bit single-precision data values stored in either of the
				1427	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1428	/// operand.
				1429	///
				1430	/// \headerfile <x86intrin.h>
				1431	///
				1432	/// This intrinsic corresponds to the \c VBLENDVPS instruction.
				1433	///
				1434	/// \param __a
				1435	/// A 256-bit vector of [8 x float].
				1436	/// \param __b
				1437	/// A 256-bit vector of [8 x float].
				1438	/// \param __c
				1439	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1440	/// and 31 specifying how the values are to be copied. The position of the
				1441	/// mask bit corresponds to the most significant bit of a copied value. When
				1442	/// a mask bit is 0, the corresponding 32-bit element in operand __a is
				1443	/// copied to the same position in the destination. When a mask bit is 1, the
				1444	/// corresponding 32-bit element in operand __b is copied to the same
				1445	/// position in the destination.
				1446	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				1447	static __inline __m256 __DEFAULT_FN_ATTRS
				1448	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
				1449	{
				1450	return (__m256)__builtin_ia32_blendvps256(
				1451	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
				1452	}
				1453
				1454	/* Vector Dot Product */
				1455	/// \brief Computes two dot products in parallel, using the lower and upper
				1456	/// halves of two [8 x float] vectors as input to the two computations, and
				1457	/// returning the two dot products in the lower and upper halves of the
				1458	/// [8 x float] result. The immediate integer operand controls which
				1459	/// input elements will contribute to the dot product, and where the final
				1460	/// results are returned. In general, for each dot product, the four
				1461	/// corresponding elements of the input vectors are multiplied; the first
				1462	/// two and second two products are summed, then the two sums are added to
				1463	/// form the final result.
				1464	///
				1465	/// \headerfile <x86intrin.h>
				1466	///
				1467	/// \code
				1468	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1469	/// \endcode
				1470	///
				1471	/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
				1472	///
				1473	/// \param V1
				1474	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1475	/// \param V2
				1476	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1477	/// \param M
				1478	/// An immediate integer argument. Bits [7:4] determine which elements of
				1479	/// the input vectors are used, with bit [4] corresponding to the lowest
				1480	/// element and bit [7] corresponding to the highest element of each [4 x
				1481	/// float] subvector. If a bit is set, the corresponding elements from the
				1482	/// two input vectors are used as an input for dot product; otherwise that
				1483	/// input is treated as zero. Bits [3:0] determine which elements of the
				1484	/// result will receive a copy of the final dot product, with bit [0]
				1485	/// corresponding to the lowest element and bit [3] corresponding to the
				1486	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1487	/// product is returned in the corresponding element; otherwise that element
				1488	/// is set to zero. The bitmask is applied in the same way to each of the
				1489	/// two parallel dot product computations. M is forwarded unchanged to the
					/// underlying builtin.
				1490	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
				1491	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
				1492	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1493	(__v8sf)(__m256)(V2), (M)); })
				1494
				1495	/* Vector shuffle */
				1496	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
				1497	/// specified by the immediate value operand. The four selected elements in
				1498	/// each operand are copied to the destination according to the bits
				1499	/// specified in the immediate operand. The selected elements from the first
				1500	/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
				1501	/// destination, and the selected elements from the second 256-bit operand
				1502	/// are copied to bits [127:64] and bits [255:192] of the destination. For
				1503	/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
				1504	/// the 256-bit destination vector would contain the following values: b[7],
				1505	/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
					///
					/// In the expansion, shuffle indices 0-7 refer to elements of a and indices
					/// 8-15 refer to elements of b. The four selections made for the lower 128
					/// bits are repeated, with the indices offset by 4, for the upper 128 bits.
				1506	///
				1507	/// \headerfile <x86intrin.h>
				1508	///
				1509	/// \code
				1510	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1511	/// \endcode
				1512	///
				1513	/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
				1514	///
				1515	/// \param a
				1516	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1517	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1518	/// according to the bits specified in the immediate operand.
				1519	/// \param b
				1520	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1521	/// operand are copied to bits [127:64] and bits [255:192] in the
				1522	/// destination, according to the bits specified in the immediate operand.
				1523	/// \param mask
				1524	/// An immediate value containing an 8-bit value specifying which elements to
				1525	/// copy from a and b. Bits [3:0] specify the values copied from operand a.
				1526	/// Bits [7:4] specify the values copied from operand b.
				1527	/// The destinations within the 256-bit destination are assigned values as
				1528	/// follows, according to the bit value assignments described below:
				1529	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
				1530	/// destination.
				1531	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
				1532	/// destination.
				1533	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
				1534	/// destination.
				1535	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
				1536	/// the destination.
				1537	/// Bit value assignments:
				1538	/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
				1539	/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
				1540	/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
				1541	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1542	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
				1543	#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
				1544	(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
				1545	(__v8sf)(__m256)(b), \
				1546	0 + (((mask) >> 0) & 0x3), \
				1547	0 + (((mask) >> 2) & 0x3), \
				1548	8 + (((mask) >> 4) & 0x3), \
				1549	8 + (((mask) >> 6) & 0x3), \
				1550	4 + (((mask) >> 0) & 0x3), \
				1551	4 + (((mask) >> 2) & 0x3), \
				1552	12 + (((mask) >> 4) & 0x3), \
				1553	12 + (((mask) >> 6) & 0x3)); })
				1554
				1555	/// \brief Selects four double-precision values from the 256-bit operands of
				1556	/// [4 x double], as specified by the immediate value operand. The selected
				1557	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1558	/// bits [191:128] in the destination, and the selected elements from the
				1559	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
				1560	/// the destination. For example, if bits [3:0] of the immediate operand
				1561	/// contain a value of 0xF, the 256-bit destination vector would contain the
				1562	/// following values: b[3], a[3], b[1], a[1].
				1563	///
				1564	/// \headerfile <x86intrin.h>
				1565	///
				1566	/// \code
				1567	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1568	/// \endcode
				1569	///
				1570	/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
				1571	///
				1572	/// \param a
				1573	/// A 256-bit vector of [4 x double].
				1574	/// \param b
				1575	/// A 256-bit vector of [4 x double].
				1576	/// \param mask
				1577	/// An immediate value whose bits [3:0] specify which elements to copy from
				1578	/// a and b (the expansion reads no other bits of the mask):
				1579	/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
				1580	/// destination.
				1581	/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
				1582	/// destination.
				1583	/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
				1584	/// destination.
				1585	/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
				1586	/// destination.
				1587	/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
				1588	/// destination.
				1589	/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
				1590	/// destination.
				1591	/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
				1592	/// destination.
				1593	/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
				1594	/// destination.
				1595	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
				1596	#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
				1597	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
				1598	(__v4df)(__m256d)(b), \
				1599	0 + (((mask) >> 0) & 0x1), \
				1600	4 + (((mask) >> 1) & 0x1), \
				1601	2 + (((mask) >> 2) & 0x1), \
				1602	6 + (((mask) >> 3) & 0x1)); })
				1603
				1604	/* Compare */
					/* Comparison predicate values, 0x00 through 0x1f. In the two-letter name
					 * suffixes, O or U marks an ordered or unordered predicate and S or Q marks
					 * a signaling or non-signaling one, as spelled out next to each value. */
				1605	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1606	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1607	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1608	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1609	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1610	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1611	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
				1612	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
				1613	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
				1614	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
				1615	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1616	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1617	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1618	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1619	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1620	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1621	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1622	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1623	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1624	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1625	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1626	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
				1627	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
				1628	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1629	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
				1630	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
				1631	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1632	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1633	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1634	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1635	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1636	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1637
				1638	/// \brief Compares each of the corresponding double-precision values of two
				1639	/// 128-bit vectors of [2 x double], using the operation specified by the
				1640	/// immediate integer operand. Returns a [2 x double] vector consisting of
				1641	/// two doubles corresponding to the two comparison results: zero if the
				1642	/// comparison is false, and all 1's if the comparison is true.
				1643	///
				1644	/// \headerfile <x86intrin.h>
				1645	///
				1646	/// \code
				1647	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1648	/// \endcode
				1649	///
				1650	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1651	///
				1652	/// \param a
				1653	/// A 128-bit vector of [2 x double].
				1654	/// \param b
				1655	/// A 128-bit vector of [2 x double].
				1656	/// \param c
				1657	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1658	/// operation to use:
				1659	/// 00h, 08h, 10h, 18h: Equal
				1660	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1661	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1662	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1663	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1664	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1665	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1666	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1667	/// The ordered/unordered and signaling/non-signaling variant of each value
				1668	/// is listed with the _CMP_* constants in this header.
				1669	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
				1670	#define _mm_cmp_pd(a, b, c) __extension__ ({ \
				1671	(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1672	(__v2df)(__m128d)(b), (c)); })
				1673
				1674	/// \brief Compares each of the corresponding values of two 128-bit vectors of
				1675	/// [4 x float], using the operation specified by the immediate integer
				1676	/// operand. Returns a [4 x float] vector consisting of four floats
				1677	/// corresponding to the four comparison results: zero if the comparison is
				1678	/// false, and all 1's if the comparison is true.
				1679	///
				1680	/// \headerfile <x86intrin.h>
				1681	///
				1682	/// \code
				1683	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1684	/// \endcode
				1685	///
				1686	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1687	///
				1688	/// \param a
				1689	/// A 128-bit vector of [4 x float].
				1690	/// \param b
				1691	/// A 128-bit vector of [4 x float].
				1692	/// \param c
				1693	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1694	/// operation to use:
				1695	/// 00h, 08h, 10h, 18h: Equal
				1696	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1697	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1698	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1699	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1700	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1701	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1702	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1703	/// The ordered/unordered and signaling/non-signaling variant of each value
				1704	/// is listed with the _CMP_* constants in this header.
				1705	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				1706	#define _mm_cmp_ps(a, b, c) __extension__ ({ \
				1707	(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1708	(__v4sf)(__m128)(b), (c)); })
				1709
				1710	/// \brief Compares each of the corresponding double-precision values of two
				1711	/// 256-bit vectors of [4 x double], using the operation specified by the
				1712	/// immediate integer operand. Returns a [4 x double] vector consisting of
				1713	/// four doubles corresponding to the four comparison results: zero if the
				1714	/// comparison is false, and all 1's if the comparison is true.
				1715	///
				1716	/// \headerfile <x86intrin.h>
				1717	///
				1718	/// \code
				1719	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1720	/// \endcode
				1721	///
				1722	/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
				1723	///
				1724	/// \param a
				1725	/// A 256-bit vector of [4 x double].
				1726	/// \param b
				1727	/// A 256-bit vector of [4 x double].
				1728	/// \param c
				1729	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1730	/// operation to use:
				1731	/// 00h, 08h, 10h, 18h: Equal
				1732	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1733	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1734	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1735	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1736	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1737	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1738	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1739	/// The ordered/unordered and signaling/non-signaling variant of each value
				1740	/// is listed with the _CMP_* constants in this header.
				1741	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
				1742	#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
				1743	(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1744	(__v4df)(__m256d)(b), (c)); })
				1745
				1746	/// \brief Compares each of the corresponding values of two 256-bit vectors of
				1747	/// [8 x float], using the operation specified by the immediate integer
				1748	/// operand. Returns a [8 x float] vector consisting of eight floats
				1749	/// corresponding to the eight comparison results: zero if the comparison is
				1750	/// false, and all 1's if the comparison is true.
				1751	///
				1752	/// \headerfile <x86intrin.h>
				1753	///
				1754	/// \code
				1755	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1756	/// \endcode
				1757	///
				1758	/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
				1759	///
				1760	/// \param a
				1761	/// A 256-bit vector of [8 x float].
				1762	/// \param b
				1763	/// A 256-bit vector of [8 x float].
				1764	/// \param c
				1765	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1766	/// operation to use:
				1767	/// 00h, 08h, 10h, 18h: Equal
				1768	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1769	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1770	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1771	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1772	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1773	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1774	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1775	/// The ordered/unordered and signaling/non-signaling variant of each value
				1776	/// is listed with the _CMP_* constants in this header.
				1777	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
				1778	#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
				1779	(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1780	(__v8sf)(__m256)(b), (c)); })
				1781
				1782	/// \brief Compares each of the corresponding scalar double-precision values of
				1783	/// two 128-bit vectors of [2 x double], using the operation specified by the
				1784	/// immediate integer operand. If the result is true, all 64 bits of the
				1785	/// destination vector are set; otherwise they are cleared.
				1786	///
				1787	/// \headerfile <x86intrin.h>
				1788	///
				1789	/// \code
				1790	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1791	/// \endcode
				1792	///
				1793	/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
				1794	///
				1795	/// \param a
				1796	/// A 128-bit vector of [2 x double].
				1797	/// \param b
				1798	/// A 128-bit vector of [2 x double].
				1799	/// \param c
				1800	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1801	/// operation to use:
				1802	/// 00h, 08h, 10h, 18h: Equal
				1803	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1804	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1805	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1806	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1807	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1808	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1809	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1810	/// The ordered/unordered and signaling/non-signaling variant of each value
				1811	/// is listed with the _CMP_* constants in this header.
				1812	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
				1813	#define _mm_cmp_sd(a, b, c) __extension__ ({ \
				1814	(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1815	(__v2df)(__m128d)(b), (c)); })
				1816
				1817	/// \brief Compares each of the corresponding scalar values of two 128-bit
				1818	/// vectors of [4 x float], using the operation specified by the immediate
				1819	/// integer operand. If the result is true, all 32 bits of the destination
				1820	/// vector are set; otherwise they are cleared.
				1821	///
				1822	/// \headerfile <x86intrin.h>
				1823	///
				1824	/// \code
				1825	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1826	/// \endcode
				1827	///
				1828	/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
				1829	///
				1830	/// \param a
				1831	/// A 128-bit vector of [4 x float].
				1832	/// \param b
				1833	/// A 128-bit vector of [4 x float].
				1834	/// \param c
				1835	/// An immediate integer operand, with bits [4:0] specifying which comparison
				1836	/// operation to use:
				1837	/// 00h, 08h, 10h, 18h: Equal
				1838	/// 01h, 11h: Less than; 09h, 19h: Not greater than or equal
				1839	/// 02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
				1840	/// 03h, 13h: Unordered; 0Bh, 1Bh: False
				1841	/// 04h, 0Ch, 14h, 1Ch: Not equal
				1842	/// 05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
				1843	/// 06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
				1844	/// 07h, 17h: Ordered; 0Fh, 1Fh: True
				1845	/// The ordered/unordered and signaling/non-signaling variant of each value
				1846	/// is listed with the _CMP_* constants in this header.
				1847	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
				1848	#define _mm_cmp_ss(a, b, c) __extension__ ({ \
				1849	(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
				1850	(__v4sf)(__m128)(b), (c)); })
				1851
				1852	/// \brief Takes a [8 x i32] vector and returns the vector element value
				1853	/// indexed by the immediate constant operand.
				1854	///
				1855	/// \headerfile <x86intrin.h>
				1856	///
				1857	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1858	/// EXTRACTF128+COMPOSITE instruction.
				1859	///
				1860	/// \param __a
				1861	/// A 256-bit vector of [8 x i32].
				1862	/// \param __imm
				1863	/// An immediate integer operand with bits [2:0] determining which vector
				1864	/// element is extracted and returned. All other bits are ignored.
				1865	/// \returns A 32-bit integer containing the extracted 32 bits of extended
				1866	/// packed data.
				1867	static __inline int __DEFAULT_FN_ATTRS
				1868	_mm256_extract_epi32(__m256i __a, const int __imm)
				1869	{
				1870	__v8si __b = (__v8si)__a;
					/* Masking with 7 keeps only index bits [2:0]. */
				1871	return __b[__imm & 7];
				1872	}
				1873
				1874	/// \brief Takes a [16 x i16] vector and returns the vector element value
				1875	/// indexed by the immediate constant operand.
				1876	///
				1877	/// \headerfile <x86intrin.h>
				1878	///
				1879	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1880	/// EXTRACTF128+COMPOSITE instruction.
				1881	///
				1882	/// \param __a
				1883	/// A 256-bit integer vector of [16 x i16].
				1884	/// \param __imm
				1885	/// An immediate integer operand with bits [3:0] determining which vector
				1886	/// element is extracted and returned. All other bits are ignored.
				1887	/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
				1888	/// packed data.
				1889	static __inline int __DEFAULT_FN_ATTRS
				1890	_mm256_extract_epi16(__m256i __a, const int __imm)
				1891	{
				1892	__v16hi __b = (__v16hi)__a;
					/* Masking with 15 keeps only index bits [3:0]. The cast to unsigned short
					 * makes the widening to the int return type a zero extension. */
				1893	return (unsigned short)__b[__imm & 15];
				1894	}
				1895
				1896	/// \brief Takes a [32 x i8] vector and returns the vector element value
				1897	/// indexed by the immediate constant operand.
				1898	///
				1899	/// \headerfile <x86intrin.h>
				1900	///
				1901	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1902	/// EXTRACTF128+COMPOSITE instruction.
				1903	///
				1904	/// \param __a
				1905	/// A 256-bit integer vector of [32 x i8].
				1906	/// \param __imm
				1907	/// An immediate integer operand with bits [4:0] determining which vector
				1908	/// element is extracted and returned. All other bits are ignored.
				1909	/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
				1910	/// packed data.
				1911	static __inline int __DEFAULT_FN_ATTRS
				1912	_mm256_extract_epi8(__m256i __a, const int __imm)
				1913	{
				1914	__v32qi __b = (__v32qi)__a;
					/* Masking with 31 keeps only index bits [4:0]. The cast to unsigned char
					 * makes the widening to the int return type a zero extension. */
				1915	return (unsigned char)__b[__imm & 31];
				1916	}
				1917
				1918	#ifdef __x86_64__
				1919	/// \brief Takes a [4 x i64] vector and returns the vector element value
				1920	/// indexed by the immediate constant operand. This function is only defined
					/// when __x86_64__ is defined.
				1921	///
				1922	/// \headerfile <x86intrin.h>
				1923	///
				1924	/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
				1925	/// EXTRACTF128+COMPOSITE instruction.
				1926	///
				1927	/// \param __a
				1928	/// A 256-bit integer vector of [4 x i64].
				1929	/// \param __imm
				1930	/// An immediate integer operand with bits [1:0] determining which vector
				1931	/// element is extracted and returned. All other bits are ignored.
				1932	/// \returns A 64-bit integer containing the extracted 64 bits of extended
				1933	/// packed data.
				1934	static __inline long long __DEFAULT_FN_ATTRS
				1935	_mm256_extract_epi64(__m256i __a, const int __imm)
				1936	{
				1937	__v4di __b = (__v4di)__a;
					/* Masking with 3 keeps only index bits [1:0]. */
				1938	return __b[__imm & 3];
				1939	}
				1940	#endif
				1941
				1942	/// \brief Takes a [8 x i32] vector and replaces the vector element value
				1943	/// indexed by the immediate constant operand by a new value. Returns the
				1944	/// modified vector.
				1945	///
				1946	/// \headerfile <x86intrin.h>
				1947	///
				1948	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1949	/// INSERTF128+COMPOSITE instruction.
				1950	///
				1951	/// \param __a
				1952	/// A vector of [8 x i32] to be used by the insert operation.
				1953	/// \param __b
				1954	/// An integer value. The replacement value for the insert operation.
				1955	/// \param __imm
				1956	/// An immediate integer specifying the index of the vector element to be
				1957	/// replaced. Only bits [2:0] are used; all other bits are ignored.
				1958	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1959	/// with __b.
				1960	static __inline __m256i __DEFAULT_FN_ATTRS
				1961	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
				1962	{
					/* __a is passed by value, so the element is replaced in a local copy. */
				1963	__v8si __c = (__v8si)__a;
				1964	__c[__imm & 7] = __b;
				1965	return (__m256i)__c;
				1966	}
				1967
				1968
				1969	/// \brief Takes a [16 x i16] vector and replaces the vector element value
				1970	/// indexed by the immediate constant operand with a new value. Returns the
				1971	/// modified vector.
				1972	///
				1973	/// \headerfile <x86intrin.h>
				1974	///
				1975	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				1976	/// INSERTF128+COMPOSITE instruction.
				1977	///
				1978	/// \param __a
				1979	/// A vector of [16 x i16] to be used by the insert operation.
				1980	/// \param __b
				1981	/// An i16 integer value, passed as an int. The replacement value for the
					/// insert operation.
				1982	/// \param __imm
				1983	/// An immediate integer specifying the index of the vector element to be
				1984	/// replaced. Only bits [3:0] are used; all other bits are ignored.
				1985	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				1986	/// with __b.
				1987	static __inline __m256i __DEFAULT_FN_ATTRS
				1988	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
				1989	{
					/* __a is passed by value, so the element is replaced in a local copy. */
				1990	__v16hi __c = (__v16hi)__a;
				1991	__c[__imm & 15] = __b;
				1992	return (__m256i)__c;
				1993	}
				1994
				1995	/// \brief Takes a [32 x i8] vector and replaces the vector element value
				1996	/// indexed by the immediate constant operand with a new value. Returns the
				1997	/// modified vector.
				1998	///
				1999	/// \headerfile <x86intrin.h>
				2000	///
				2001	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2002	/// INSERTF128+COMPOSITE instruction.
				2003	///
				2004	/// \param __a
				2005	/// A vector of [32 x i8] to be used by the insert operation.
				2006	/// \param __b
				2007	/// An i8 integer value, passed as an int. The replacement value for the
					/// insert operation.
				2008	/// \param __imm
				2009	/// An immediate integer specifying the index of the vector element to be
				2010	/// replaced. Only bits [4:0] are used; all other bits are ignored.
				2011	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2012	/// with __b.
				2013	static __inline __m256i __DEFAULT_FN_ATTRS
				2014	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
				2015	{
					/* __a is passed by value, so the element is replaced in a local copy. */
				2016	__v32qi __c = (__v32qi)__a;
				2017	__c[__imm & 31] = __b;
				2018	return (__m256i)__c;
				2019	}
				2020
				2021	#ifdef __x86_64__
				2022	/// \brief Takes a [4 x i64] vector and replaces the vector element value
				2023	/// indexed by the immediate constant operand with a new value. Returns the
				2024	/// modified vector. This function is only defined when __x86_64__ is
					/// defined.
				2025	///
				2026	/// \headerfile <x86intrin.h>
				2027	///
				2028	/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
				2029	/// INSERTF128+COMPOSITE instruction.
				2030	///
				2031	/// \param __a
				2032	/// A vector of [4 x i64] to be used by the insert operation.
				2033	/// \param __b
				2034	/// A 64-bit integer value. The replacement value for the insert operation.
				2035	/// \param __imm
				2036	/// An immediate integer specifying the index of the vector element to be
				2037	/// replaced. Only bits [1:0] are used; all other bits are ignored.
				2038	/// \returns A copy of vector __a, after replacing its element indexed by __imm
				2039	/// with __b.
				2040	static __inline __m256i __DEFAULT_FN_ATTRS
				2041	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
				2042	{
					/* __a is passed by value, so the element is replaced in a local copy. */
				2043	__v4di __c = (__v4di)__a;
				2044	__c[__imm & 3] = __b;
				2045	return (__m256i)__c;
				2046	}
				2047	#endif
				2048
				2049	/* Conversion */
				2050	/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. The
					/// conversion is done element by element with __builtin_convertvector.
				2051	///
				2052	/// \headerfile <x86intrin.h>
				2053	///
				2054	/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
				2055	///
				2056	/// \param __a
				2057	/// A 128-bit integer vector of [4 x i32].
				2058	/// \returns A 256-bit vector of [4 x double] containing the converted values.
				2059	static __inline __m256d __DEFAULT_FN_ATTRS
				2060	_mm256_cvtepi32_pd(__m128i __a)
				2061	{
				2062	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
				2063	}
				2064
				2065	/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
				2066	///
				2067	/// \headerfile <x86intrin.h>
				2068	///
				2069	/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
				2070	///
				2071	/// \param __a
				2072	/// A 256-bit integer vector of [8 x i32].
				2073	/// \returns A 256-bit vector of [8 x float] containing the converted values.
				2074	static __inline __m256 __DEFAULT_FN_ATTRS
				2075	_mm256_cvtepi32_ps(__m256i __a)
				2076	{
				2077	return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
				2078	}
				2079
				2080	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2081	/// [4 x float].
				2082	///
				2083	/// \headerfile <x86intrin.h>
				2084	///
				2085	/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
				2086	///
				2087	/// \param __a
				2088	/// A 256-bit vector of [4 x double].
				2089	/// \returns A 128-bit vector of [4 x float] containing the converted values.
				2090	static __inline __m128 __DEFAULT_FN_ATTRS
				2091	_mm256_cvtpd_ps(__m256d __a)
				2092	{
					/* The operand is cast to __v4df ([4 x double]) for the builtin, and the
					 * builtin's result is cast to the __m128 return type. */
				2093	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
				2094	}
				2095
				2096	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
				2097	///
				2098	/// \headerfile <x86intrin.h>
				2099	///
				2100	/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
				2101	///
				2102	/// \param __a
				2103	/// A 256-bit vector of [8 x float].
				2104	/// \returns A 256-bit integer vector containing the converted values.
				2105	static __inline __m256i __DEFAULT_FN_ATTRS
				2106	_mm256_cvtps_epi32(__m256 __a)
				2107	{
				2108	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
				2109	}
				2110
				2111	static __inline __m256d __DEFAULT_FN_ATTRS
				2112	_mm256_cvtps_pd(__m128 __a)
				2113	{
				2114	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
				2115	}
				2116
				2117	static __inline __m128i __DEFAULT_FN_ATTRS
				2118	_mm256_cvttpd_epi32(__m256d __a)
				2119	{
				2120	return (__m128i)__builtin_convertvector((__v4df) __a, __v4si);
				2121	}
				2122
				2123	static __inline __m128i __DEFAULT_FN_ATTRS
				2124	_mm256_cvtpd_epi32(__m256d __a)
				2125	{
				2126	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
				2127	}
				2128
				2129	static __inline __m256i __DEFAULT_FN_ATTRS
				2130	_mm256_cvttps_epi32(__m256 __a)
				2131	{
				2132	return (__m256i)__builtin_convertvector((__v8sf) __a, __v8si);
				2133	}
				2134
				2135	static __inline double __DEFAULT_FN_ATTRS
				2136	_mm256_cvtsd_f64(__m256d __a)
				2137	{
				2138	return __a[0];
				2139	}
				2140
				2141	static __inline int __DEFAULT_FN_ATTRS
				2142	_mm256_cvtsi256_si32(__m256i __a)
				2143	{
				2144	__v8si __b = (__v8si)__a;
				2145	return __b[0];
				2146	}
				2147
				2148	static __inline float __DEFAULT_FN_ATTRS
				2149	_mm256_cvtss_f32(__m256 __a)
				2150	{
				2151	return __a[0];
				2152	}
				2153
				2154	/* Vector replicate */
				2155	static __inline __m256 __DEFAULT_FN_ATTRS
				2156	_mm256_movehdup_ps(__m256 __a)
				2157	{
				2158	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
				2159	}
				2160
				2161	static __inline __m256 __DEFAULT_FN_ATTRS
				2162	_mm256_moveldup_ps(__m256 __a)
				2163	{
				2164	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
				2165	}
				2166
				2167	static __inline __m256d __DEFAULT_FN_ATTRS
				2168	_mm256_movedup_pd(__m256d __a)
				2169	{
				2170	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
				2171	}
				2172
				2173	/* Unpack and Interleave */
				2174	static __inline __m256d __DEFAULT_FN_ATTRS
				2175	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
				2176	{
				2177	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
				2178	}
				2179
				2180	static __inline __m256d __DEFAULT_FN_ATTRS
				2181	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
				2182	{
				2183	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
				2184	}
				2185
				2186	static __inline __m256 __DEFAULT_FN_ATTRS
				2187	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
				2188	{
				2189	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
				2190	}
				2191
				2192	static __inline __m256 __DEFAULT_FN_ATTRS
				2193	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
				2194	{
				2195	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
				2196	}
				2197
				2198	/* Bit Test */
				2199	static __inline int __DEFAULT_FN_ATTRS
				2200	_mm_testz_pd(__m128d __a, __m128d __b)
				2201	{
				2202	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
				2203	}
				2204
				2205	static __inline int __DEFAULT_FN_ATTRS
				2206	_mm_testc_pd(__m128d __a, __m128d __b)
				2207	{
				2208	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
				2209	}
				2210
				2211	static __inline int __DEFAULT_FN_ATTRS
				2212	_mm_testnzc_pd(__m128d __a, __m128d __b)
				2213	{
				2214	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
				2215	}
				2216
				2217	static __inline int __DEFAULT_FN_ATTRS
				2218	_mm_testz_ps(__m128 __a, __m128 __b)
				2219	{
				2220	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
				2221	}
				2222
				2223	static __inline int __DEFAULT_FN_ATTRS
				2224	_mm_testc_ps(__m128 __a, __m128 __b)
				2225	{
				2226	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
				2227	}
				2228
				2229	static __inline int __DEFAULT_FN_ATTRS
				2230	_mm_testnzc_ps(__m128 __a, __m128 __b)
				2231	{
				2232	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
				2233	}
				2234
				2235	static __inline int __DEFAULT_FN_ATTRS
				2236	_mm256_testz_pd(__m256d __a, __m256d __b)
				2237	{
				2238	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
				2239	}
				2240
				2241	static __inline int __DEFAULT_FN_ATTRS
				2242	_mm256_testc_pd(__m256d __a, __m256d __b)
				2243	{
				2244	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
				2245	}
				2246
				2247	static __inline int __DEFAULT_FN_ATTRS
				2248	_mm256_testnzc_pd(__m256d __a, __m256d __b)
				2249	{
				2250	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
				2251	}
				2252
				2253	static __inline int __DEFAULT_FN_ATTRS
				2254	_mm256_testz_ps(__m256 __a, __m256 __b)
				2255	{
				2256	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
				2257	}
				2258
				2259	static __inline int __DEFAULT_FN_ATTRS
				2260	_mm256_testc_ps(__m256 __a, __m256 __b)
				2261	{
				2262	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
				2263	}
				2264
				2265	static __inline int __DEFAULT_FN_ATTRS
				2266	_mm256_testnzc_ps(__m256 __a, __m256 __b)
				2267	{
				2268	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
				2269	}
				2270
				2271	static __inline int __DEFAULT_FN_ATTRS
				2272	_mm256_testz_si256(__m256i __a, __m256i __b)
				2273	{
				2274	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
				2275	}
				2276
				2277	static __inline int __DEFAULT_FN_ATTRS
				2278	_mm256_testc_si256(__m256i __a, __m256i __b)
				2279	{
				2280	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
				2281	}
				2282
				2283	static __inline int __DEFAULT_FN_ATTRS
				2284	_mm256_testnzc_si256(__m256i __a, __m256i __b)
				2285	{
				2286	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
				2287	}
				2288
				2289	/* Vector extract sign mask */
				2290	static __inline int __DEFAULT_FN_ATTRS
				2291	_mm256_movemask_pd(__m256d __a)
				2292	{
				2293	return __builtin_ia32_movmskpd256((__v4df)__a);
				2294	}
				2295
				2296	static __inline int __DEFAULT_FN_ATTRS
				2297	_mm256_movemask_ps(__m256 __a)
				2298	{
				2299	return __builtin_ia32_movmskps256((__v8sf)__a);
				2300	}
				2301
				2302	/* Vector zero */
				2303	static __inline void __DEFAULT_FN_ATTRS
				2304	_mm256_zeroall(void)
				2305	{
				2306	__builtin_ia32_vzeroall();
				2307	}
				2308
				2309	static __inline void __DEFAULT_FN_ATTRS
				2310	_mm256_zeroupper(void)
				2311	{
				2312	__builtin_ia32_vzeroupper();
				2313	}
				2314
				2315	/* Vector load with broadcast */
				2316	static __inline __m128 __DEFAULT_FN_ATTRS
				2317	_mm_broadcast_ss(float const *__a)
				2318	{
				2319	float __f = *__a;
				2320	return (__m128)(__v4sf){ __f, __f, __f, __f };
				2321	}
				2322
				2323	static __inline __m256d __DEFAULT_FN_ATTRS
				2324	_mm256_broadcast_sd(double const *__a)
				2325	{
				2326	double __d = *__a;
				2327	return (__m256d)(__v4df){ __d, __d, __d, __d };
				2328	}
				2329
				2330	static __inline __m256 __DEFAULT_FN_ATTRS
				2331	_mm256_broadcast_ss(float const *__a)
				2332	{
				2333	float __f = *__a;
				2334	return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
				2335	}
				2336
				2337	static __inline __m256d __DEFAULT_FN_ATTRS
				2338	_mm256_broadcast_pd(__m128d const *__a)
				2339	{
				2340	return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
				2341	}
				2342
				2343	static __inline __m256 __DEFAULT_FN_ATTRS
				2344	_mm256_broadcast_ps(__m128 const *__a)
				2345	{
				2346	return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
				2347	}
				2348
				2349	/* SIMD load ops */
				2350	static __inline __m256d __DEFAULT_FN_ATTRS
				2351	_mm256_load_pd(double const *__p)
				2352	{
				2353	return (__m256d )__p;
				2354	}
				2355
				2356	static __inline __m256 __DEFAULT_FN_ATTRS
				2357	_mm256_load_ps(float const *__p)
				2358	{
				2359	return (__m256 )__p;
				2360	}
				2361
				2362	static __inline __m256d __DEFAULT_FN_ATTRS
				2363	_mm256_loadu_pd(double const *__p)
				2364	{
				2365	struct __loadu_pd {
				2366	__m256d __v;
				2367	} __attribute__((__packed__, __may_alias__));
				2368	return ((struct __loadu_pd*)__p)->__v;
				2369	}
				2370
				2371	static __inline __m256 __DEFAULT_FN_ATTRS
				2372	_mm256_loadu_ps(float const *__p)
				2373	{
				2374	struct __loadu_ps {
				2375	__m256 __v;
				2376	} __attribute__((__packed__, __may_alias__));
				2377	return ((struct __loadu_ps*)__p)->__v;
				2378	}
				2379
				2380	static __inline __m256i __DEFAULT_FN_ATTRS
				2381	_mm256_load_si256(__m256i const *__p)
				2382	{
				2383	return *__p;
				2384	}
				2385
				2386	static __inline __m256i __DEFAULT_FN_ATTRS
				2387	_mm256_loadu_si256(__m256i const *__p)
				2388	{
				2389	struct __loadu_si256 {
				2390	__m256i __v;
				2391	} __attribute__((__packed__, __may_alias__));
				2392	return ((struct __loadu_si256*)__p)->__v;
				2393	}
				2394
				2395	static __inline __m256i __DEFAULT_FN_ATTRS
				2396	_mm256_lddqu_si256(__m256i const *__p)
				2397	{
				2398	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
				2399	}
				2400
				2401	/* SIMD store ops */
				2402	static __inline void __DEFAULT_FN_ATTRS
				2403	_mm256_store_pd(double *__p, __m256d __a)
				2404	{
				2405	(__m256d )__p = __a;
				2406	}
				2407
				2408	static __inline void __DEFAULT_FN_ATTRS
				2409	_mm256_store_ps(float *__p, __m256 __a)
				2410	{
				2411	(__m256 )__p = __a;
				2412	}
				2413
				2414	static __inline void __DEFAULT_FN_ATTRS
				2415	_mm256_storeu_pd(double *__p, __m256d __a)
				2416	{
				2417	struct __storeu_pd {
				2418	__m256d __v;
				2419	} __attribute__((__packed__, __may_alias__));
				2420	((struct __storeu_pd*)__p)->__v = __a;
				2421	}
				2422
				2423	static __inline void __DEFAULT_FN_ATTRS
				2424	_mm256_storeu_ps(float *__p, __m256 __a)
				2425	{
				2426	struct __storeu_ps {
				2427	__m256 __v;
				2428	} __attribute__((__packed__, __may_alias__));
				2429	((struct __storeu_ps*)__p)->__v = __a;
				2430	}
				2431
				2432	static __inline void __DEFAULT_FN_ATTRS
				2433	_mm256_store_si256(__m256i *__p, __m256i __a)
				2434	{
				2435	*__p = __a;
				2436	}
				2437
				2438	static __inline void __DEFAULT_FN_ATTRS
				2439	_mm256_storeu_si256(__m256i *__p, __m256i __a)
				2440	{
				2441	struct __storeu_si256 {
				2442	__m256i __v;
				2443	} __attribute__((__packed__, __may_alias__));
				2444	((struct __storeu_si256*)__p)->__v = __a;
				2445	}
				2446
				2447	/* Conditional load ops */
				2448	static __inline __m128d __DEFAULT_FN_ATTRS
				2449	_mm_maskload_pd(double const *__p, __m128i __m)
				2450	{
				2451	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
				2452	}
				2453
				2454	static __inline __m256d __DEFAULT_FN_ATTRS
				2455	_mm256_maskload_pd(double const *__p, __m256i __m)
				2456	{
				2457	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
				2458	(__v4di)__m);
				2459	}
				2460
				2461	static __inline __m128 __DEFAULT_FN_ATTRS
				2462	_mm_maskload_ps(float const *__p, __m128i __m)
				2463	{
				2464	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
				2465	}
				2466
				2467	static __inline __m256 __DEFAULT_FN_ATTRS
				2468	_mm256_maskload_ps(float const *__p, __m256i __m)
				2469	{
				2470	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
				2471	}
				2472
				2473	/* Conditional store ops */
				2474	static __inline void __DEFAULT_FN_ATTRS
				2475	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
				2476	{
				2477	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
				2478	}
				2479
				2480	static __inline void __DEFAULT_FN_ATTRS
				2481	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
				2482	{
				2483	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
				2484	}
				2485
				2486	static __inline void __DEFAULT_FN_ATTRS
				2487	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
				2488	{
				2489	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
				2490	}
				2491
				2492	static __inline void __DEFAULT_FN_ATTRS
				2493	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
				2494	{
				2495	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
				2496	}
				2497
				2498	/* Cacheability support ops */
				2499	static __inline void __DEFAULT_FN_ATTRS
				2500	_mm256_stream_si256(__m256i *__a, __m256i __b)
				2501	{
				2502	__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
				2503	}
				2504
				2505	static __inline void __DEFAULT_FN_ATTRS
				2506	_mm256_stream_pd(double *__a, __m256d __b)
				2507	{
				2508	__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
				2509	}
				2510
				2511	static __inline void __DEFAULT_FN_ATTRS
				2512	_mm256_stream_ps(float *__p, __m256 __a)
				2513	{
				2514	__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
				2515	}
				2516
				2517	/* Create vectors */
				2518	static __inline__ __m256d __DEFAULT_FN_ATTRS
				2519	_mm256_undefined_pd(void)
				2520	{
				2521	return (__m256d)__builtin_ia32_undef256();
				2522	}
				2523
				2524	static __inline__ __m256 __DEFAULT_FN_ATTRS
				2525	_mm256_undefined_ps(void)
				2526	{
				2527	return (__m256)__builtin_ia32_undef256();
				2528	}
				2529
				2530	static __inline__ __m256i __DEFAULT_FN_ATTRS
				2531	_mm256_undefined_si256(void)
				2532	{
				2533	return (__m256i)__builtin_ia32_undef256();
				2534	}
				2535
				2536	static __inline __m256d __DEFAULT_FN_ATTRS
				2537	_mm256_set_pd(double __a, double __b, double __c, double __d)
				2538	{
				2539	return (__m256d){ __d, __c, __b, __a };
				2540	}
				2541
				2542	static __inline __m256 __DEFAULT_FN_ATTRS
				2543	_mm256_set_ps(float __a, float __b, float __c, float __d,
				2544	float __e, float __f, float __g, float __h)
				2545	{
				2546	return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
				2547	}
				2548
				2549	static __inline __m256i __DEFAULT_FN_ATTRS
				2550	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
				2551	int __i4, int __i5, int __i6, int __i7)
				2552	{
				2553	return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
				2554	}
				2555
				2556	static __inline __m256i __DEFAULT_FN_ATTRS
				2557	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
				2558	short __w11, short __w10, short __w09, short __w08,
				2559	short __w07, short __w06, short __w05, short __w04,
				2560	short __w03, short __w02, short __w01, short __w00)
				2561	{
				2562	return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
				2563	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
				2564	}
				2565
				2566	static __inline __m256i __DEFAULT_FN_ATTRS
				2567	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
				2568	char __b27, char __b26, char __b25, char __b24,
				2569	char __b23, char __b22, char __b21, char __b20,
				2570	char __b19, char __b18, char __b17, char __b16,
				2571	char __b15, char __b14, char __b13, char __b12,
				2572	char __b11, char __b10, char __b09, char __b08,
				2573	char __b07, char __b06, char __b05, char __b04,
				2574	char __b03, char __b02, char __b01, char __b00)
				2575	{
				2576	return (__m256i)(__v32qi){
				2577	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				2578	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				2579	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				2580	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
				2581	};
				2582	}
				2583
				2584	static __inline __m256i __DEFAULT_FN_ATTRS
				2585	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
				2586	{
				2587	return (__m256i)(__v4di){ __d, __c, __b, __a };
				2588	}
				2589
				2590	/* Create vectors with elements in reverse order */
				2591	static __inline __m256d __DEFAULT_FN_ATTRS
				2592	_mm256_setr_pd(double __a, double __b, double __c, double __d)
				2593	{
				2594	return (__m256d){ __a, __b, __c, __d };
				2595	}
				2596
				2597	static __inline __m256 __DEFAULT_FN_ATTRS
				2598	_mm256_setr_ps(float __a, float __b, float __c, float __d,
				2599	float __e, float __f, float __g, float __h)
				2600	{
				2601	return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
				2602	}
				2603
				2604	static __inline __m256i __DEFAULT_FN_ATTRS
				2605	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
				2606	int __i4, int __i5, int __i6, int __i7)
				2607	{
				2608	return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
				2609	}
				2610
				2611	static __inline __m256i __DEFAULT_FN_ATTRS
				2612	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
				2613	short __w11, short __w10, short __w09, short __w08,
				2614	short __w07, short __w06, short __w05, short __w04,
				2615	short __w03, short __w02, short __w01, short __w00)
				2616	{
				2617	return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
				2618	__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
				2619	}
				2620
				2621	static __inline __m256i __DEFAULT_FN_ATTRS
				2622	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
				2623	char __b27, char __b26, char __b25, char __b24,
				2624	char __b23, char __b22, char __b21, char __b20,
				2625	char __b19, char __b18, char __b17, char __b16,
				2626	char __b15, char __b14, char __b13, char __b12,
				2627	char __b11, char __b10, char __b09, char __b08,
				2628	char __b07, char __b06, char __b05, char __b04,
				2629	char __b03, char __b02, char __b01, char __b00)
				2630	{
				2631	return (__m256i)(__v32qi){
				2632	__b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
				2633	__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
				2634	__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
				2635	__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
				2636	}
				2637
				2638	static __inline __m256i __DEFAULT_FN_ATTRS
				2639	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
				2640	{
				2641	return (__m256i)(__v4di){ __a, __b, __c, __d };
				2642	}
				2643
				2644	/* Create vectors with repeated elements */
				2645	static __inline __m256d __DEFAULT_FN_ATTRS
				2646	_mm256_set1_pd(double __w)
				2647	{
				2648	return (__m256d){ __w, __w, __w, __w };
				2649	}
				2650
				2651	static __inline __m256 __DEFAULT_FN_ATTRS
				2652	_mm256_set1_ps(float __w)
				2653	{
				2654	return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
				2655	}
				2656
				2657	static __inline __m256i __DEFAULT_FN_ATTRS
				2658	_mm256_set1_epi32(int __i)
				2659	{
				2660	return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
				2661	}
				2662
				2663	static __inline __m256i __DEFAULT_FN_ATTRS
				2664	_mm256_set1_epi16(short __w)
				2665	{
				2666	return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
				2667	__w, __w, __w, __w, __w, __w };
				2668	}
				2669
				2670	static __inline __m256i __DEFAULT_FN_ATTRS
				2671	_mm256_set1_epi8(char __b)
				2672	{
				2673	return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2674	__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
				2675	__b, __b, __b, __b, __b, __b, __b };
				2676	}
				2677
				2678	static __inline __m256i __DEFAULT_FN_ATTRS
				2679	_mm256_set1_epi64x(long long __q)
				2680	{
				2681	return (__m256i)(__v4di){ __q, __q, __q, __q };
				2682	}
				2683
				2684	/* Create zeroed vectors */
				2685	static __inline __m256d __DEFAULT_FN_ATTRS
				2686	_mm256_setzero_pd(void)
				2687	{
				2688	return (__m256d){ 0, 0, 0, 0 };
				2689	}
				2690
				2691	static __inline __m256 __DEFAULT_FN_ATTRS
				2692	_mm256_setzero_ps(void)
				2693	{
				2694	return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
				2695	}
				2696
				2697	static __inline __m256i __DEFAULT_FN_ATTRS
				2698	_mm256_setzero_si256(void)
				2699	{
				2700	return (__m256i){ 0LL, 0LL, 0LL, 0LL };
				2701	}
				2702
				2703	/* Cast between vector types */
				2704	static __inline __m256 __DEFAULT_FN_ATTRS
				2705	_mm256_castpd_ps(__m256d __a)
				2706	{
				2707	return (__m256)__a;
				2708	}
				2709
				2710	static __inline __m256i __DEFAULT_FN_ATTRS
				2711	_mm256_castpd_si256(__m256d __a)
				2712	{
				2713	return (__m256i)__a;
				2714	}
				2715
				2716	static __inline __m256d __DEFAULT_FN_ATTRS
				2717	_mm256_castps_pd(__m256 __a)
				2718	{
				2719	return (__m256d)__a;
				2720	}
				2721
				2722	static __inline __m256i __DEFAULT_FN_ATTRS
				2723	_mm256_castps_si256(__m256 __a)
				2724	{
				2725	return (__m256i)__a;
				2726	}
				2727
				2728	static __inline __m256 __DEFAULT_FN_ATTRS
				2729	_mm256_castsi256_ps(__m256i __a)
				2730	{
				2731	return (__m256)__a;
				2732	}
				2733
				2734	static __inline __m256d __DEFAULT_FN_ATTRS
				2735	_mm256_castsi256_pd(__m256i __a)
				2736	{
				2737	return (__m256d)__a;
				2738	}
				2739
				2740	static __inline __m128d __DEFAULT_FN_ATTRS
				2741	_mm256_castpd256_pd128(__m256d __a)
				2742	{
				2743	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
				2744	}
				2745
				2746	static __inline __m128 __DEFAULT_FN_ATTRS
				2747	_mm256_castps256_ps128(__m256 __a)
				2748	{
				2749	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
				2750	}
				2751
				2752	static __inline __m128i __DEFAULT_FN_ATTRS
				2753	_mm256_castsi256_si128(__m256i __a)
				2754	{
				2755	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
				2756	}
				2757
				2758	static __inline __m256d __DEFAULT_FN_ATTRS
				2759	_mm256_castpd128_pd256(__m128d __a)
				2760	{
				2761	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
				2762	}
				2763
				2764	static __inline __m256 __DEFAULT_FN_ATTRS
				2765	_mm256_castps128_ps256(__m128 __a)
				2766	{
				2767	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
				2768	}
				2769
				2770	static __inline __m256i __DEFAULT_FN_ATTRS
				2771	_mm256_castsi128_si256(__m128i __a)
				2772	{
				2773	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
				2774	}
				2775
				2776	/*
				2777	Vector insert.
				2778	We use macros rather than inlines because we only want to accept
				2779	invocations where the immediate M is a constant expression.
				2780	*/
				/* Returns V1 (as [8 x float]) with one 128-bit half replaced by V2.
				   If bit 0 of M is set, V2 supplies result elements 4..7 and V1 supplies
				   0..3; otherwise V2 supplies elements 0..3 and V1 supplies 4..7.
				   Shuffle indices 8..11 refer to V2 after it is widened to 256 bits. */
				2781	#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
				2782	(__m256)__builtin_shufflevector( \
				2783	(__v8sf)(__m256)(V1), \
				2784	(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
				2785	(((M) & 1) ? 0 : 8), \
				2786	(((M) & 1) ? 1 : 9), \
				2787	(((M) & 1) ? 2 : 10), \
				2788	(((M) & 1) ? 3 : 11), \
				2789	(((M) & 1) ? 8 : 4), \
				2790	(((M) & 1) ? 9 : 5), \
				2791	(((M) & 1) ? 10 : 6), \
				2792	(((M) & 1) ? 11 : 7) );})
				2793
				/* Returns V1 (as [4 x double]) with one 128-bit half replaced by V2.
				   If bit 0 of M is set, V2 supplies result elements 2..3 and V1 supplies
				   0..1; otherwise V2 supplies elements 0..1 and V1 supplies 2..3.
				   Shuffle indices 4..5 refer to V2 after it is widened to 256 bits. */
				2794	#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
				2795	(__m256d)__builtin_shufflevector( \
				2796	(__v4df)(__m256d)(V1), \
				2797	(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
				2798	(((M) & 1) ? 0 : 4), \
				2799	(((M) & 1) ? 1 : 5), \
				2800	(((M) & 1) ? 4 : 2), \
				2801	(((M) & 1) ? 5 : 3) );})
				2802
				/* Returns V1 (as [4 x i64]) with one 128-bit half replaced by V2.
				   If bit 0 of M is set, V2 supplies result elements 2..3 and V1 supplies
				   0..1; otherwise V2 supplies elements 0..1 and V1 supplies 2..3.
				   Shuffle indices 4..5 refer to V2 after it is widened to 256 bits. */
				2803	#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
				2804	(__m256i)__builtin_shufflevector( \
				2805	(__v4di)(__m256i)(V1), \
				2806	(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
				2807	(((M) & 1) ? 0 : 4), \
				2808	(((M) & 1) ? 1 : 5), \
				2809	(((M) & 1) ? 4 : 2), \
				2810	(((M) & 1) ? 5 : 3) );})
				2811
				2812	/*
				2813	Vector extract.
				2814	We use macros rather than inlines because we only want to accept
				2815	invocations where the immediate M is a constant expression.
				2816	*/
				/* Extracts one 128-bit half of V (as [8 x float]): elements 4..7 when
				   bit 0 of M is set, elements 0..3 otherwise. The second shuffle operand
				   is never selected, so an undefined vector is passed for it. */
				2817	#define _mm256_extractf128_ps(V, M) __extension__ ({ \
				2818	(__m128)__builtin_shufflevector( \
				2819	(__v8sf)(__m256)(V), \
				2820	(__v8sf)(_mm256_undefined_ps()), \
				2821	(((M) & 1) ? 4 : 0), \
				2822	(((M) & 1) ? 5 : 1), \
				2823	(((M) & 1) ? 6 : 2), \
				2824	(((M) & 1) ? 7 : 3) );})
				2825
				/* Extracts one 128-bit half of V (as [4 x double]): elements 2..3 when
				   bit 0 of M is set, elements 0..1 otherwise. The second shuffle operand
				   is never selected, so an undefined vector is passed for it. */
				2826	#define _mm256_extractf128_pd(V, M) __extension__ ({ \
				2827	(__m128d)__builtin_shufflevector( \
				2828	(__v4df)(__m256d)(V), \
				2829	(__v4df)(_mm256_undefined_pd()), \
				2830	(((M) & 1) ? 2 : 0), \
				2831	(((M) & 1) ? 3 : 1) );})
				2832
				/* Extracts one 128-bit half of V (as [4 x i64]): elements 2..3 when
				   bit 0 of M is set, elements 0..1 otherwise. The second shuffle operand
				   is never selected, so an undefined vector is passed for it. */
				2833	#define _mm256_extractf128_si256(V, M) __extension__ ({ \
				2834	(__m128i)__builtin_shufflevector( \
				2835	(__v4di)(__m256i)(V), \
				2836	(__v4di)(_mm256_undefined_si256()), \
				2837	(((M) & 1) ? 2 : 0), \
				2838	(((M) & 1) ? 3 : 1) );})
				2839
				2840	/* SIMD load ops (unaligned) */
				2841	static __inline __m256 __DEFAULT_FN_ATTRS
				2842	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
				2843	{
				2844	__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
				2845	return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
				2846	}
				2847
				2848	static __inline __m256d __DEFAULT_FN_ATTRS
				2849	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
				2850	{
				2851	__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
				2852	return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
				2853	}
				2854
				2855	static __inline __m256i __DEFAULT_FN_ATTRS
				2856	_mm256_loadu2_m128i(__m128i const __addr_hi, __m128i const __addr_lo)
				2857	{
				2858	__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
				2859	return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
				2860	}
				2861
				2862	/* SIMD store ops (unaligned) */
				2863	static __inline void __DEFAULT_FN_ATTRS
				2864	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
				2865	{
				2866	__m128 __v128;
				2867
				2868	__v128 = _mm256_castps256_ps128(__a);
				2869	_mm_storeu_ps(__addr_lo, __v128);
				2870	__v128 = _mm256_extractf128_ps(__a, 1);
				2871	_mm_storeu_ps(__addr_hi, __v128);
				2872	}
				2873
				2874	static __inline void __DEFAULT_FN_ATTRS
				2875	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
				2876	{
				2877	__m128d __v128;
				2878
				2879	__v128 = _mm256_castpd256_pd128(__a);
				2880	_mm_storeu_pd(__addr_lo, __v128);
				2881	__v128 = _mm256_extractf128_pd(__a, 1);
				2882	_mm_storeu_pd(__addr_hi, __v128);
				2883	}
				2884
				2885	static __inline void __DEFAULT_FN_ATTRS
				2886	_mm256_storeu2_m128i(__m128i __addr_hi, __m128i __addr_lo, __m256i __a)
				2887	{
				2888	__m128i __v128;
				2889
				2890	__v128 = _mm256_castsi256_si128(__a);
				2891	_mm_storeu_si128(__addr_lo, __v128);
				2892	__v128 = _mm256_extractf128_si256(__a, 1);
				2893	_mm_storeu_si128(__addr_hi, __v128);
				2894	}
				2895
				2896	static __inline __m256 __DEFAULT_FN_ATTRS
				2897	_mm256_set_m128 (__m128 __hi, __m128 __lo) {
				2898	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
				2899	}
				2900
				2901	static __inline __m256d __DEFAULT_FN_ATTRS
				2902	_mm256_set_m128d (__m128d __hi, __m128d __lo) {
				2903	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2904	}
				2905
				2906	static __inline __m256i __DEFAULT_FN_ATTRS
				2907	_mm256_set_m128i (__m128i __hi, __m128i __lo) {
				2908	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2909	}
				2910
				2911	static __inline __m256 __DEFAULT_FN_ATTRS
				2912	_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
				2913	return _mm256_set_m128(__hi, __lo);
				2914	}
				2915
				2916	static __inline __m256d __DEFAULT_FN_ATTRS
				2917	_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
				2918	return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2919	}
				2920
				2921	static __inline __m256i __DEFAULT_FN_ATTRS
				2922	_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
				2923	return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
				2924	}
				2925
				2926	#undef __DEFAULT_FN_ATTRS
				2927
				2928	#endif /* __AVXINTRIN_H */