pl/math/v_erfcf_1u.c - platform/external/arm-optimized-routines - Gitiles

 /*
  * Single-precision vector erfc(x) function.
  *
  * Copyright (c) 2021-2023, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */

 #include "v_math.h"
 #include "erfcf.h"
 #include "estrin.h"
 #include "pl_sig.h"
 #include "pl_test.h"

 #if V_SUPPORTED

 #define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]

 VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

 static VPCS_ATTR NOINLINE v_f32_t
 specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
 {
   return v_call_f32 (erfcf, x, y, special);
 }

 static inline uint32_t
 interval_index (uint32_t ia12)
 {
   // clang-format off
   return (ia12 < 0x400 ? 0 :
          (ia12 < 0x408 ? 1 :
          (ia12 < 0x410 ? 2 :
                          3)));
   // clang-format on
 }

 /* The C macro wraps the coeffs argument in order to make the
    poynomial evaluation more readable. In the scalarised variant the
    second pointer is ignored.  */
 #ifdef SCALAR
 #define C(i) coeff1[i]
 #else
 #define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
 #endif

 static inline v_f64_t
 v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
 			   const double *coeff2)
 {
   v_f64_t x2 = x * x;
   v_f64_t x4 = x2 * x2;
   v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
   v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
   return poly * gauss;
 }

 static inline float
 approx_poly_gauss (float abs_x, const double *coeff)
 {
   return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
 }

 static v_f32_t
 v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
 {
 #ifdef SCALAR
   float y = approx_poly_gauss (abs_x, P (ia12));
   return sign ? 2 - y : y;
 #else
   float32x2_t lo32 = {0, 0};
   float32x2_t hi32 = {0, 0};
   /* The polynomial and Gaussian components must be calculated in
      double precision in order to meet the required ULP error. This
      means we have to promote low and high halves of the
      single-precision input vector to two separate double-precision
      input vectors. This incurs some overhead, and there is also
      overhead to loading the polynomial coefficients as this cannot be
      done in a vector fashion. This would be wasted effort for
      elements which lie in the 'boring' zone, as they will be
      overwritten later. Hence we use the lanes parameter to only do
      the promotion on a pair of lanes if both of those lanes are
      interesting and not special cases. If one lane is inactive, we
      use a scalar routine which is shared with the scalar variant.  */
   if (lanes[0] & lanes[1])
     {
       lo32 = vcvt_f32_f64 (
 	v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
 				   P (ia12[0]), P (ia12[1])));
     }
   else if (lanes[0])
     {
       lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
     }
   else if (lanes[1])
     {
       lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
     }

   if (lanes[2] & lanes[3])
     {
       hi32
 	= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
 						   P (ia12[2]), P (ia12[3])));
     }
   else if (lanes[2])
     {
       hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
     }
   else if (lanes[3])
     {
       hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
     }

   v_f32_t y = vcombine_f32 (lo32, hi32);

   if (v_any_u32 (sign))
     {
       y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
     }

   return y;
 #endif
 }

 /* Optimized single-precision vector complementary error function
    erfcf. Max measured error: 0.750092 at various values between
    -0x1.06521p-20 and -0x1.add1dap-17. For example:
    __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
    +0.249908 ulp err 0.250092.  */
 VPCS_ATTR
 v_f32_t V_NAME (erfcf) (v_f32_t x)
 {
   v_u32_t ix = v_as_u32_f32 (x);
   v_u32_t ia = ix & 0x7fffffff;
   v_u32_t ia12 = ia >> 20;
   v_u32_t sign = ix >> 31;
   v_u32_t inf_ia12 = v_u32 (0x7f8);

   v_u32_t special_cases
     = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
   v_u32_t in_bounds
     = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
   v_f32_t boring_zone = v_as_f32_u32 (sign << 30);

 #ifdef SCALAR
   if (unlikely (special_cases))
     {
       if (ia12 >= 0x7f8)
 	return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
       else
 	return 1.0f - x; /* Small case.  */
     }
   else if (likely (!in_bounds))
     {
       return sign ? boring_zone : __math_uflowf (boring_zone);
     }
 #endif

   v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
 			      in_bounds & ~special_cases);

 #ifndef SCALAR
   y = vbslq_f32 (~in_bounds, boring_zone, y);

   if (unlikely (v_any_u32 (special_cases)))
     {
       return specialcase (x, y, special_cases);
     }
 #endif

   return y;
 }
 VPCS_ALIAS

 PL_SIG (V, F, 1, erfc, -6.0, 28.0)
 PL_TEST_ULP (V_NAME (erfcf), 0.26)
 PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
 PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
 PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
 PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
 PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
 PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
 #endif
	/*
	* Single-precision vector erfc(x) function.
	*
	* Copyright (c) 2021-2023, Arm Limited.
	* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
	*/

	#include "v_math.h"
	#include "erfcf.h"
	#include "estrin.h"
	#include "pl_sig.h"
	#include "pl_test.h"

	#if V_SUPPORTED

	#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]

	VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

	static VPCS_ATTR NOINLINE v_f32_t
	specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
	{
	return v_call_f32 (erfcf, x, y, special);
	}

	static inline uint32_t
	interval_index (uint32_t ia12)
	{
	// clang-format off
	return (ia12 < 0x400 ? 0 :
	(ia12 < 0x408 ? 1 :
	(ia12 < 0x410 ? 2 :
	3)));
	// clang-format on
	}

	/* The C macro wraps the coeffs argument in order to make the
	poynomial evaluation more readable. In the scalarised variant the
	second pointer is ignored. */
	#ifdef SCALAR
	#define C(i) coeff1[i]
	#else
	#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
	#endif

	static inline v_f64_t
	v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
	const double *coeff2)
	{
	v_f64_t x2 = x * x;
	v_f64_t x4 = x2 * x2;
	v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
	v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
	return poly * gauss;
	}

	static inline float
	approx_poly_gauss (float abs_x, const double *coeff)
	{
	return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
	}

	static v_f32_t
	v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
	{
	#ifdef SCALAR
	float y = approx_poly_gauss (abs_x, P (ia12));
	return sign ? 2 - y : y;
	#else
	float32x2_t lo32 = {0, 0};
	float32x2_t hi32 = {0, 0};
	/* The polynomial and Gaussian components must be calculated in
	double precision in order to meet the required ULP error. This
	means we have to promote low and high halves of the
	single-precision input vector to two separate double-precision
	input vectors. This incurs some overhead, and there is also
	overhead to loading the polynomial coefficients as this cannot be
	done in a vector fashion. This would be wasted effort for
	elements which lie in the 'boring' zone, as they will be
	overwritten later. Hence we use the lanes parameter to only do
	the promotion on a pair of lanes if both of those lanes are
	interesting and not special cases. If one lane is inactive, we
	use a scalar routine which is shared with the scalar variant. */
	if (lanes[0] & lanes[1])
	{
	lo32 = vcvt_f32_f64 (
	v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
	P (ia12[0]), P (ia12[1])));
	}
	else if (lanes[0])
	{
	lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
	}
	else if (lanes[1])
	{
	lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
	}

	if (lanes[2] & lanes[3])
	{
	hi32
	= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
	P (ia12[2]), P (ia12[3])));
	}
	else if (lanes[2])
	{
	hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
	}
	else if (lanes[3])
	{
	hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
	}

	v_f32_t y = vcombine_f32 (lo32, hi32);

	if (v_any_u32 (sign))
	{
	y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
	}

	return y;
	#endif
	}

	/* Optimized single-precision vector complementary error function
	erfcf. Max measured error: 0.750092 at various values between
	-0x1.06521p-20 and -0x1.add1dap-17. For example:
	__v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
	+0.249908 ulp err 0.250092. */
	VPCS_ATTR
	v_f32_t V_NAME (erfcf) (v_f32_t x)
	{
	v_u32_t ix = v_as_u32_f32 (x);
	v_u32_t ia = ix & 0x7fffffff;
	v_u32_t ia12 = ia >> 20;
	v_u32_t sign = ix >> 31;
	v_u32_t inf_ia12 = v_u32 (0x7f8);

	v_u32_t special_cases
	= v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
	v_u32_t in_bounds
	= v_cond_u32 ((ia < 0x408ccccd) \| (~sign & (ix < 0x4120f5c3)));
	v_f32_t boring_zone = v_as_f32_u32 (sign << 30);

	#ifdef SCALAR
	if (unlikely (special_cases))
	{
	if (ia12 >= 0x7f8)
	return (float) (sign << 1) + 1.0f / x; /* Special cases. */
	else
	return 1.0f - x; /* Small case. */
	}
	else if (likely (!in_bounds))
	{
	return sign ? boring_zone : __math_uflowf (boring_zone);
	}
	#endif

	v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
	in_bounds & ~special_cases);

	#ifndef SCALAR
	y = vbslq_f32 (~in_bounds, boring_zone, y);

	if (unlikely (v_any_u32 (special_cases)))
	{
	return specialcase (x, y, special_cases);
	}
	#endif

	return y;
	}
	VPCS_ALIAS

	PL_SIG (V, F, 1, erfc, -6.0, 28.0)
	PL_TEST_ULP (V_NAME (erfcf), 0.26)
	PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
	PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
	PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
	PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
	PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
	PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
	#endif