blob: 963490d789bd896e828c73d7f48c63a09d038639 [file] [log] [blame]
/*
* Single-precision vector erfc(x) function.
*
* Copyright (c) 2021-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
#include "erfcf.h"
#include "estrin.h"
#include "pl_sig.h"
#include "pl_test.h"
#if V_SUPPORTED
#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
static VPCS_ATTR NOINLINE v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
{
return v_call_f32 (erfcf, x, y, special);
}
static inline uint32_t
interval_index (uint32_t ia12)
{
// clang-format off
return (ia12 < 0x400 ? 0 :
(ia12 < 0x408 ? 1 :
(ia12 < 0x410 ? 2 :
3)));
// clang-format on
}
/* The C macro wraps the coeffs argument in order to make the
poynomial evaluation more readable. In the scalarised variant the
second pointer is ignored. */
#ifdef SCALAR
#define C(i) coeff1[i]
#else
#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
#endif
static inline v_f64_t
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
const double *coeff2)
{
v_f64_t x2 = x * x;
v_f64_t x4 = x2 * x2;
v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
return poly * gauss;
}
static inline float
approx_poly_gauss (float abs_x, const double *coeff)
{
return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
}
static v_f32_t
v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
{
#ifdef SCALAR
float y = approx_poly_gauss (abs_x, P (ia12));
return sign ? 2 - y : y;
#else
float32x2_t lo32 = {0, 0};
float32x2_t hi32 = {0, 0};
/* The polynomial and Gaussian components must be calculated in
double precision in order to meet the required ULP error. This
means we have to promote low and high halves of the
single-precision input vector to two separate double-precision
input vectors. This incurs some overhead, and there is also
overhead to loading the polynomial coefficients as this cannot be
done in a vector fashion. This would be wasted effort for
elements which lie in the 'boring' zone, as they will be
overwritten later. Hence we use the lanes parameter to only do
the promotion on a pair of lanes if both of those lanes are
interesting and not special cases. If one lane is inactive, we
use a scalar routine which is shared with the scalar variant. */
if (lanes[0] & lanes[1])
{
lo32 = vcvt_f32_f64 (
v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
P (ia12[0]), P (ia12[1])));
}
else if (lanes[0])
{
lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
}
else if (lanes[1])
{
lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
}
if (lanes[2] & lanes[3])
{
hi32
= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
P (ia12[2]), P (ia12[3])));
}
else if (lanes[2])
{
hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
}
else if (lanes[3])
{
hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
}
v_f32_t y = vcombine_f32 (lo32, hi32);
if (v_any_u32 (sign))
{
y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
}
return y;
#endif
}
/* Optimized single-precision vector complementary error function
erfcf. Max measured error: 0.750092 at various values between
-0x1.06521p-20 and -0x1.add1dap-17. For example:
__v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
+0.249908 ulp err 0.250092. */
VPCS_ATTR
v_f32_t V_NAME (erfcf) (v_f32_t x)
{
v_u32_t ix = v_as_u32_f32 (x);
v_u32_t ia = ix & 0x7fffffff;
v_u32_t ia12 = ia >> 20;
v_u32_t sign = ix >> 31;
v_u32_t inf_ia12 = v_u32 (0x7f8);
v_u32_t special_cases
= v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
v_u32_t in_bounds
= v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
v_f32_t boring_zone = v_as_f32_u32 (sign << 30);
#ifdef SCALAR
if (unlikely (special_cases))
{
if (ia12 >= 0x7f8)
return (float) (sign << 1) + 1.0f / x; /* Special cases. */
else
return 1.0f - x; /* Small case. */
}
else if (likely (!in_bounds))
{
return sign ? boring_zone : __math_uflowf (boring_zone);
}
#endif
v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
in_bounds & ~special_cases);
#ifndef SCALAR
y = vbslq_f32 (~in_bounds, boring_zone, y);
if (unlikely (v_any_u32 (special_cases)))
{
return specialcase (x, y, special_cases);
}
#endif
return y;
}
VPCS_ALIAS
PL_SIG (V, F, 1, erfc, -6.0, 28.0)
PL_TEST_ULP (V_NAME (erfcf), 0.26)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
#endif