Snap for 10447354 from 172d24a7ae67ee7bae413d5a8618f1b5edc002be to mainline-wifi-release

Change-Id: Ic30b2e20af121e229db4c369ed03b47663a2f9db
diff --git a/Android.bp b/Android.bp
old mode 100755
new mode 100644
index 8fe656e..62e26e7
--- a/Android.bp
+++ b/Android.bp
@@ -109,36 +109,96 @@
         arm64: {
             srcs: [
                 "string/aarch64/memchr-mte.S",
+                "string/aarch64/memchr-sve.S",
                 "string/aarch64/memchr.S",
+                "string/aarch64/memcmp-sve.S",
                 "string/aarch64/memcmp.S",
+                "string/aarch64/memcpy-advsimd.S",
+                "string/aarch64/memcpy-sve.S",
+                "string/aarch64/memcpy.S",
                 "string/aarch64/memrchr.S",
-                "string/aarch64/stpcpy-mte.S",
+                "string/aarch64/memset.S",
+                "string/aarch64/stpcpy-sve.S",
                 "string/aarch64/stpcpy.S",
                 "string/aarch64/strchrnul-mte.S",
+                "string/aarch64/strchrnul-sve.S",
                 "string/aarch64/strchrnul.S",
                 "string/aarch64/strchr-mte.S",
+                "string/aarch64/strchr-sve.S",
                 "string/aarch64/strchr.S",
-                "string/aarch64/strcmp-mte.S",
+                "string/aarch64/strcmp-sve.S",
                 "string/aarch64/strcmp.S",
-                "string/aarch64/strcpy-mte.S",
+                "string/aarch64/strcpy-sve.S",
                 "string/aarch64/strcpy.S",
                 "string/aarch64/strlen-mte.S",
+                "string/aarch64/strlen-sve.S",
                 "string/aarch64/strlen.S",
-                "string/aarch64/strncmp-mte.S",
+                "string/aarch64/strncmp-sve.S",
                 "string/aarch64/strncmp.S",
+                "string/aarch64/strnlen-sve.S",
                 "string/aarch64/strnlen.S",
                 "string/aarch64/strrchr-mte.S",
+                "string/aarch64/strrchr-sve.S",
                 "string/aarch64/strrchr.S",
             ],
             asflags: [
-                "-D__memcmp_aarch64=memcmp",
+                "-march=armv8-a+sve",
+                "-D__memset_aarch64=memset",
                 "-D__memrchr_aarch64=memrchr",
-                "-D__strnlen_aarch64=strnlen",
             ]
         },
     },
 }
 
+// Memory intrinsics for bare-metal Rust binaries.
+cc_library_static {
+    name: "libarm-optimized-routines-mem",
+    nocrt: true,
+    system_shared_libs: [],
+    stl: "none",
+    sanitize: {
+        hwaddress: false,
+    },
+    arch: {
+        arm64: {
+            srcs: [
+                "string/aarch64/memchr.S",
+                "string/aarch64/memcmp.S",
+                "string/aarch64/memcpy.S",
+                "string/aarch64/memrchr.S",
+                "string/aarch64/memset.S",
+                "string/aarch64/stpcpy.S",
+                "string/aarch64/strchr.S",
+                "string/aarch64/strchrnul.S",
+                "string/aarch64/strcmp.S",
+                "string/aarch64/strcpy.S",
+                "string/aarch64/strlen.S",
+                "string/aarch64/strncmp.S",
+                "string/aarch64/strnlen.S",
+                "string/aarch64/strrchr.S",
+            ],
+            asflags: [
+                "-D__memchr_aarch64=memchr",
+                "-D__memcmp_aarch64=memcmp",
+                "-D__memcpy_aarch64=memcpy",
+                "-D__memmove_aarch64=memmove",
+                "-D__memrchr_aarch64=memrchr",
+                "-D__memset_aarch64=memset",
+                "-D__stpcpy_aarch64=stpcpy",
+                "-D__strchr_aarch64=strchr",
+                "-D__strchrnul_aarch64=strchrnul",
+                "-D__strcmp_aarch64=strcmp",
+                "-D__strcpy_aarch64=strcpy",
+                "-D__strlen_aarch64=strlen",
+                "-D__strncmp_aarch64=strncmp",
+                "-D__strnlen_aarch64=strnlen",
+                "-D__strrchr_aarch64=strrchr",
+            ],
+        },
+    },
+    visibility: ["//bionic/libc"],
+}
+
 // adb shell "/data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/test/testcases/directed/*"
 // adb shell "/data/nativetest/mathtest/mathtest /data/nativetest/mathtest/test/testcases/directed/*"
 cc_test {
@@ -162,6 +222,10 @@
     name: "ulp",
     defaults: ["arm-optimized-routines-defaults"],
     gtest: false,
+
+    // https://github.com/ARM-software/optimized-routines/issues/53
+    local_include_dirs: ["math/"],
+
     srcs: ["math/test/ulp.c"],
     data: ["math/test/runulp.sh"],
 }
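
The asflags in the bare-metal library above rely on the assembly sources defining each routine under its reserved __*_aarch64 name; the -D macros rename those entry points to the standard symbols when the .S files are preprocessed. A minimal smoke test of the resulting symbols (a hypothetical file, not part of this change), assuming the static library is linked into the binary:

#include <stdio.h>
#include <string.h>

int main (void)
{
  char dst[8] = { 0 };
  /* With libarm-optimized-routines-mem linked in, these standard names
     resolve to the renamed aarch64 routines, e.g. __memcpy_aarch64.  */
  memcpy (dst, "hello", 6);
  printf ("%zu %d\n", strlen (dst), memcmp (dst, "hello", 6));
  return 0;
}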
diff --git a/LICENSE b/LICENSE
index 2543b82..20a4b77 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,11 @@
-MIT License
+MIT OR Apache-2.0 WITH LLVM-exception
+=====================================
 
-Copyright (c) 1999-2019, Arm Limited.
+
+MIT License
+-----------
+
+Copyright (c) 1999-2022, Arm Limited.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -19,3 +24,226 @@
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+Apache-2.0 WITH LLVM-exception
+------------------------------
+
+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+--- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
diff --git a/MAINTAINERS b/MAINTAINERS
new file mode 100644
index 0000000..6c5823a
--- /dev/null
+++ b/MAINTAINERS
@@ -0,0 +1,12 @@
+/
+	Szabolcs Nagy <szabolcs.nagy@arm.com>
+math/
+	Szabolcs Nagy <szabolcs.nagy@arm.com>
+networking/
+	Szabolcs Nagy <szabolcs.nagy@arm.com>
+pl/
+	Pierre Blanchard <pierre.blanchard@arm.com>
+	Joe Ramsay <joe.ramsay@arm.com>
+string/
+	Szabolcs Nagy <szabolcs.nagy@arm.com>
+	Wilco Dijkstra <wilco.dijkstra@arm.com>
diff --git a/METADATA b/METADATA
index 59af591..97f7332 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update arm-optimized-routines
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
 name: "ARM-software/optimized-routines"
 description: "Optimized implementations of various library functions for ARM architecture processors "
 third_party {
@@ -9,11 +13,11 @@
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "v21.02"
+  version: "v23.01"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2021
-    month: 2
-    day: 18
+    year: 2023
+    month: 1
+    day: 25
   }
 }
diff --git a/Makefile b/Makefile
index 169f89e..c487896 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Makefile - requires GNU make
 #
-# Copyright (c) 2018-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 srcdir = .
 prefix = /usr
@@ -11,6 +11,7 @@
 
 # Configure these in config.mk, do not make changes in this file.
 SUBS = math string networking
+PLSUBS = math
 HOST_CC = cc
 HOST_CFLAGS = -std=c99 -O2
 HOST_LDFLAGS =
@@ -20,6 +21,7 @@
 CFLAGS = -std=c99 -O2
 CFLAGS_SHARED = -fPIC
 CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
+CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
 LDFLAGS =
 LDLIBS =
 AR = $(CROSS_COMPILE)ar
@@ -51,6 +53,7 @@
 	mkdir -p $@
 
 $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
+$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)
 
 build/%.o: $(srcdir)/%.S
 	$(CC) $(CFLAGS_ALL) -c -o $@ $<
diff --git a/README b/README
index ae465e9..a2143a2 100644
--- a/README
+++ b/README
@@ -2,14 +2,17 @@
 ----------------------
 
 This repository contains implementations of library functions
-provided by Arm under MIT License (See LICENSE). Contributions
-to this project are accepted, but Contributors have to sign an
-Assignment Agreement, please follow the instructions in
+provided by Arm. The outbound license is available under a dual
+license, at the user's election, as reflected in the LICENSE file.
+Contributions to this project are accepted, but Contributors have
+to sign an Assignment Agreement; please follow the instructions in
 contributor-agreement.pdf. This is needed so upstreaming code
-to projects that require copyright assignment is possible.
+to projects that require copyright assignment is possible. Further
+contribution requirements are documented in README.contributors of
+the appropriate subdirectory.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.11.
+release is v23.01.
 
 Source code layout:
 
@@ -24,6 +27,7 @@
 string/         - string routines subproject sources.
 string/include/ - string library public headers.
 string/test/    - string test and benchmark related sources.
+pl/...          - separately maintained performance library code.
 
 The steps to build the target libraries and run the tests:
 
diff --git a/README.contributors b/README.contributors
new file mode 100644
index 0000000..f8fcdde
--- /dev/null
+++ b/README.contributors
@@ -0,0 +1,44 @@
+GENERIC CONTRIBUTION GUIDELINES
+===============================
+
+1. Sub-projects are maintained independently and thus have independent
+   contribution rules. If there exists a README.contributors in the
+   sub-directory to which the contribution is made, it must be followed.
+
+2. Legal:
+   - Contributors who are not employed by Arm must sign an Assignment Agreement.
+     See contributor-agreement.pdf.
+   - All code must be copyright owned by Arm Limited and the appropriate
+     copyright notice and license identifier must be present in every source
+     file.
+
+3. Build:
+   - Build should only depend on GNU make and POSIX utilities (shell, awk, sed,
+     etc.) and on a C toolchain.
+   - Build should pass with the default configuration (see config.mk.dist)
+     and other supported configurations, with both gcc- and clang-based
+     toolchains. (The build should not depend on a recent toolchain; the use
+     of a new feature should be possible to disable.)
+   - Currently there is no automated configuration; target-specific configuration
+     should be done via make variables in config.mk. This is the user interface
+     to the build system, so it should be documented in sufficient detail and
+     kept reasonably stable.
+
+4. Testing:
+   - On aarch64 the tests must pass. If the code may behave differently under
+     some supported configurations (e.g. CFLAGS) those should be tested.
+   - New symbols are expected to have new associated test code and ideally
+     benchmark code too.
+
+5. Commits:
+   - Commit message should be descriptive and should not refer to Arm internal
+     information (such as Jira tickets, or internal discussions). Non-obvious
+     decisions should be recorded or explained in the commit message if they are
+     not explained in source comments.
+   - Ideally tools and scripts used to write the code should be added to the
+     repository or at least mentioned in the commit.
+   - Logically independent changes should not be mixed into the same commit.
+
+6. Style:
+   - Unless the sub-project requires otherwise, follow the
+     clang-format tool using the style from the gcc contrib/ directory.
diff --git a/config.mk.dist b/config.mk.dist
index 177e1ac..7a84975 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,11 +1,14 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 # Subprojects to build
 SUBS = math string networking
 
+# Subsubprojects to build if subproject pl is built
+PLSUBS = math
+
 # Target architecture: aarch64, arm or x86_64
 ARCH = aarch64
 
@@ -59,6 +62,23 @@
 # Disable vector math code
 #math-cflags += -DWANT_VMATH=0
 
+# Disable/enable SVE vector math code and tests
+WANT_SVE_MATH = 0
+ifeq ($(WANT_SVE_MATH), 1)
+  math-cflags += -march=armv8.2-a+sve
+endif
+math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
+
+# If defined to 1, set errno in math functions according to ISO C.  Many math
+# libraries do not set errno, so this is 0 by default.  It may need to be
+# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
+WANT_ERRNO = 0
+math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
+
+# If set to 1, raise floating-point (fenv) exceptions in vector math routines.
+WANT_SIMD_EXCEPT = 0
+math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
+
 # Disable fenv checks
 #math-ulpflags = -q -f
 #math-testflags = -nostatus
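
The math.h condition mentioned in the WANT_ERRNO comment above can be probed directly with standard C99; a minimal sketch, independent of this repository:

#include <math.h>
#include <stdio.h>

int main (void)
{
  /* Prints 1 if the libc advertises errno-based math error handling,
     in which case WANT_ERRNO = 1 matches math.h; 0 otherwise.  */
  printf ("%d\n", (math_errhandling & MATH_ERRNO) != 0);
  return 0;
}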
diff --git a/math/Dir.mk b/math/Dir.mk
index 3b841ab..2a9cad1 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,7 +1,7 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/math
 B := build/math
@@ -15,6 +15,7 @@
 math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
 
 math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
 
 math-libs := \
 	build/lib/libmathlib.so \
@@ -42,10 +43,11 @@
 	$(math-tools) \
 	$(math-host-tools) \
 	$(math-includes) \
+	$(math-test-includes) \
 
-all-math: $(math-libs) $(math-tools) $(math-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
 
-$(math-objs): $(math-includes)
+$(math-objs): $(math-includes) $(math-test-includes)
 $(math-objs): CFLAGS_ALL += $(math-cflags)
 $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 $(math-host-objs): CC = $(HOST_CC)
@@ -83,6 +85,9 @@
 build/include/%.h: $(S)/include/%.h
 	cp $< $@
 
+build/include/test/%.h: $(S)/test/%.h
+	cp $< $@
+
 build/bin/%.sh: $(S)/test/%.sh
 	cp $< $@
 
@@ -96,7 +101,7 @@
 	cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
 
 check-math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR)
+	ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
 
 check-math: check-math-test check-math-rtest check-math-ulp
 
diff --git a/math/README.contributors b/math/README.contributors
new file mode 100644
index 0000000..33e7ba3
--- /dev/null
+++ b/math/README.contributors
@@ -0,0 +1,78 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc, so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This means, e.g., that internal symbols
+   should be hidden and in the implementation reserved namespace according to
+   ISO C and POSIX rules. If possible the built shared libraries and static
+   library archives should be usable to override libc symbols at link time (or
+   at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
+   (other than symbol versioning); this cannot be done reliably for static
+   linking, so this is a best-effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
+==============================================
+
+1. Math functions have quality and performance requirements.
+
+2. Quality:
+   - Worst-case ULP error should be small in the entire input domain (for most
+     common double precision scalar functions the target is < 0.66 ULP error,
+     and < 1 ULP for single precision; even a performance-optimized function
+     variant should not have > 5 ULP error if the goal is to be a drop-in
+     replacement for a standard math function); this should be tested
+     statistically (or on all inputs if possible in a reasonable amount of time).
+     The ulp tool is for this and runulp.sh should be updated for new functions.
+
+   - All standard rounding modes need to be supported but in non-default rounding
+     modes the quality requirement can be relaxed. (Non-nearest rounded
+     computation can be slow and inaccurate but has to be correct for conformance
+     reasons.)
+
+   - Special cases and error handling need to follow ISO C Annex F requirements,
+     POSIX requirements, IEEE 754-2008 requirements and glibc requirements:
+     https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
+     this should be tested by direct tests (glibc test system may be used for it).
+
+   - Error handling code should be decoupled from the approximation code as much
+     as possible. (There are helper functions; these take care of errno as well
+     as exception raising.)
+
+   - Vector math code does not need to work in non-nearest rounding mode and error
+     handling side effects need not happen (fenv exceptions and errno), but the
+     result should be correct (within quality requirements, which are lower for
+     vector code than for scalar code).
+
+   - Error bounds of the approximation should be clearly documented.
+
+   - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux
+     systems. (Routines and features can be disabled on specific targets, but
+     the build must complete). On aarch64, both little- and big-endian targets
+     are supported as well as valid combinations of architecture extensions.
+     The configurations that should be tested depend on the contribution.
+
+3. Performance:
+   - Common math code should be benchmarked on modern aarch64 microarchitectures
+     over typical inputs.
+
+   - Performance improvements should be documented (relative numbers can be
+     published; it is enough to use the mathbench microbenchmark tool which should
+     be updated for new functions).
+
+   - Attention should be paid to the compilation flags: for aarch64 fma
+     contraction should be on and math errno turned off so some builtins can be
+     inlined.
+
+   - The code should be reasonably performant on x86_64 too, e.g. some rounding
+     instructions and fma may not be available on x86_64; such builtins turn into
+     libc calls with slow code. Such slowdown is not acceptable; a faster fallback
+     should be present: glibc and bionic use the same code on all targets. (This
+     does not apply to vector math code).
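
To illustrate the ULP quality targets in item 2 above: the error of a single-precision result can be estimated against a double-precision reference. The helper below is only a sketch (the repository's real measurement is the ulp tool) and ignores subnormals and binade-boundary subtleties:

#include <math.h>
#include <stdio.h>

/* Approximate error of 'got' in units of the last place of the
   single-precision value nearest to 'want'.  */
static double
ulp_err (float got, double want)
{
  int e;
  frexp (want, &e);                     /* want = m * 2^e, 0.5 <= |m| < 1 */
  double one_ulp = ldexp (1.0, e - 24); /* one float ULP in that binade */
  return fabs ((double) got - want) / one_ulp;
}

int main (void)
{
  /* A correctly rounded sinf gives < 0.5 here; < 1 meets the
     single-precision target quoted above.  */
  printf ("%.3g\n", ulp_err (sinf (1.0f), sin (1.0)));
  return 0;
}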
diff --git a/math/cosf.c b/math/cosf.c
index f29f194..6293ce8 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision cos function.
  *
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -22,7 +22,7 @@
   int n;
   const sincos_t *p = &__sincosf_table[0];
 
-  if (abstop12 (y) < abstop12 (pio4))
+  if (abstop12 (y) < abstop12 (pio4f))
     {
       double x2 = x * x;
 
diff --git a/math/erf.c b/math/erf.c
index 12d7e51..5f9f40d 100644
--- a/math/erf.c
+++ b/math/erf.c
@@ -2,7 +2,7 @@
  * Double-precision erf(x) function.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/erf_data.c b/math/erf_data.c
index 807875b..10cf1fa 100644
--- a/math/erf_data.c
+++ b/math/erf_data.c
@@ -2,7 +2,7 @@
  * Shared data between erf and erfc.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/erff.c b/math/erff.c
index a58e825..9fa476d 100644
--- a/math/erff.c
+++ b/math/erff.c
@@ -2,7 +2,7 @@
  * Single-precision erf(x) function.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/math/erff_data.c b/math/erff_data.c
index fa6b1ef..f822788 100644
--- a/math/erff_data.c
+++ b/math/erff_data.c
@@ -2,7 +2,7 @@
  * Data for approximation of erff.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/exp.c b/math/exp.c
index 7f5024c..1de500c 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -2,7 +2,7 @@
  * Double-precision e^x function.
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <float.h>
diff --git a/math/exp2.c b/math/exp2.c
index 35ab39f..a1eee44 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -2,7 +2,7 @@
  * Double-precision 2^x function.
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <float.h>
diff --git a/math/exp2f.c b/math/exp2f.c
index 94b3253..776c3dd 100644
--- a/math/exp2f.c
+++ b/math/exp2f.c
@@ -2,7 +2,7 @@
  * Single-precision 2^x function.
  *
  * Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
diff --git a/math/exp2f_data.c b/math/exp2f_data.c
index 3fb0ad1..f0cb7fc 100644
--- a/math/exp2f_data.c
+++ b/math/exp2f_data.c
@@ -2,7 +2,7 @@
  * Shared data between expf, exp2f and powf.
  *
  * Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/exp_data.c b/math/exp_data.c
index cba7683..714c845 100644
--- a/math/exp_data.c
+++ b/math/exp_data.c
@@ -2,7 +2,7 @@
  * Shared data between exp, exp2 and pow.
  *
  * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/expf.c b/math/expf.c
index 9b2f0c3..08a20d5 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -2,7 +2,7 @@
  * Single-precision e^x function.
  *
  * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 279d829..c520c37 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -2,7 +2,7 @@
  * Public API.
  *
  * Copyright (c) 2015-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _MATHLIB_H
diff --git a/math/log.c b/math/log.c
index d3b7bc6..43dfc2a 100644
--- a/math/log.c
+++ b/math/log.c
@@ -2,7 +2,7 @@
  * Double-precision log(x) function.
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <float.h>
diff --git a/math/log2.c b/math/log2.c
index 55102b7..3f9c21b 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -2,7 +2,7 @@
  * Double-precision log2(x) function.
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <float.h>
diff --git a/math/log2_data.c b/math/log2_data.c
index 3fc9b47..293bd7d 100644
--- a/math/log2_data.c
+++ b/math/log2_data.c
@@ -2,7 +2,7 @@
  * Data for log2.
  *
  * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/log2f.c b/math/log2f.c
index acb629e..0a44fa2 100644
--- a/math/log2f.c
+++ b/math/log2f.c
@@ -2,7 +2,7 @@
  * Single-precision log2 function.
  *
  * Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
diff --git a/math/log2f_data.c b/math/log2f_data.c
index f3546d7..4866ef7 100644
--- a/math/log2f_data.c
+++ b/math/log2f_data.c
@@ -2,7 +2,7 @@
  * Data definition for log2f.
  *
  * Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/log_data.c b/math/log_data.c
index 96a098d..3ecc1f4 100644
--- a/math/log_data.c
+++ b/math/log_data.c
@@ -2,7 +2,7 @@
  * Data for log.
  *
  * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/logf.c b/math/logf.c
index cfbaee1..820f74c 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision log function.
  *
- * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
@@ -57,7 +57,7 @@
   tmp = ix - OFF;
   i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
   k = (int32_t) tmp >> 23; /* arithmetic shift */
-  iz = ix - (tmp & 0x1ff << 23);
+  iz = ix - (tmp & 0xff800000);
   invc = T[i].invc;
   logc = T[i].logc;
   z = (double_t) asfloat (iz);
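
The new mask in the hunk above is numerically identical to the old expression, but spelling it out avoids an out-of-range shift of a signed constant: 0x1ff has type int, and 0x1ff << 23 exceeds INT_MAX, which is undefined behaviour in C99. A quick standalone check of the equivalence (assuming 32-bit int):

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  /* Cast before shifting so the check itself is well defined.  */
  uint32_t old_mask = (uint32_t) 0x1ff << 23;
  uint32_t new_mask = 0xff800000;
  printf ("%#x %#x %d\n", old_mask, new_mask, old_mask == new_mask);
  return 0;
}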
diff --git a/math/logf_data.c b/math/logf_data.c
index e8973ce..0424768 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -2,7 +2,7 @@
  * Data definition for logf.
  *
  * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/math_config.h b/math/math_config.h
index e851043..7ffc0cd 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -2,7 +2,7 @@
  * Configuration for math routines.
  *
  * Copyright (c) 2017-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _MATH_CONFIG_H
diff --git a/math/math_err.c b/math/math_err.c
index 1bf9538..cfe0728 100644
--- a/math/math_err.c
+++ b/math/math_err.c
@@ -2,7 +2,7 @@
  * Double-precision math error handling.
  *
  * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/math_errf.c b/math/math_errf.c
index d5350b8..4233918 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -2,7 +2,7 @@
  * Single-precision math error handling.
  *
  * Copyright (c) 2017-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/pow.c b/math/pow.c
index 86842c6..af719fe 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -2,7 +2,7 @@
  * Double-precision x^y function.
  *
  * Copyright (c) 2018-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <float.h>
diff --git a/math/pow_log_data.c b/math/pow_log_data.c
index 45569c5..2a4c250 100644
--- a/math/pow_log_data.c
+++ b/math/pow_log_data.c
@@ -2,7 +2,7 @@
  * Data for the log part of pow.
  *
  * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/powf.c b/math/powf.c
index 6ba45d3..05c80bb 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -2,7 +2,7 @@
  * Single-precision pow function.
  *
  * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index 97e0d98..243836a 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -2,7 +2,7 @@
  * Data definition for powf.
  *
  * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "math_config.h"
diff --git a/math/s_cos.c b/math/s_cos.c
index 53a95b0..e66d563 100644
--- a/math/s_cos.c
+++ b/math/s_cos.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_cos.c"
diff --git a/math/s_cosf.c b/math/s_cosf.c
index 914c02e..f615d26 100644
--- a/math/s_cosf.c
+++ b/math/s_cosf.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_cosf.c"
diff --git a/math/s_exp.c b/math/s_exp.c
index ac7246b..5da0099 100644
--- a/math/s_exp.c
+++ b/math/s_exp.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_exp.c"
diff --git a/math/s_exp2f.c b/math/s_exp2f.c
index df7dfd6..dcbfea9 100644
--- a/math/s_exp2f.c
+++ b/math/s_exp2f.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_exp2f.c"
diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c
index 5e3852b..bf387e4 100644
--- a/math/s_exp2f_1u.c
+++ b/math/s_exp2f_1u.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_exp2f_1u.c"
diff --git a/math/s_expf.c b/math/s_expf.c
index 3492c46..dacda7f 100644
--- a/math/s_expf.c
+++ b/math/s_expf.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_expf.c"
diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c
index eb7bbcb..0009644 100644
--- a/math/s_expf_1u.c
+++ b/math/s_expf_1u.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_expf_1u.c"
diff --git a/math/s_log.c b/math/s_log.c
index 23289cf..27d2eb2 100644
--- a/math/s_log.c
+++ b/math/s_log.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_log.c"
diff --git a/math/s_logf.c b/math/s_logf.c
index 9399350..7d98b2b 100644
--- a/math/s_logf.c
+++ b/math/s_logf.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_logf.c"
diff --git a/math/s_pow.c b/math/s_pow.c
index 2e34c9f..6eca2b2 100644
--- a/math/s_pow.c
+++ b/math/s_pow.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_pow.c"
diff --git a/math/s_powf.c b/math/s_powf.c
index 6d91a4a..1d55d90 100644
--- a/math/s_powf.c
+++ b/math/s_powf.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_powf.c"
diff --git a/math/s_sin.c b/math/s_sin.c
index 06982c2..0c61712 100644
--- a/math/s_sin.c
+++ b/math/s_sin.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_sin.c"
diff --git a/math/s_sinf.c b/math/s_sinf.c
index 68ca908..3aae611 100644
--- a/math/s_sinf.c
+++ b/math/s_sinf.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #define SCALAR 1
 #include "v_sinf.c"
diff --git a/math/sincosf.c b/math/sincosf.c
index 9746f1c..446f21d 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision sin/cos function.
  *
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -22,7 +22,7 @@
   int n;
   const sincos_t *p = &__sincosf_table[0];
 
-  if (abstop12 (y) < abstop12 (pio4))
+  if (abstop12 (y) < abstop12 (pio4f))
     {
       double x2 = x * x;
 
diff --git a/math/sincosf.h b/math/sincosf.h
index 1e80fc9..ec23ed7 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -1,8 +1,8 @@
 /*
  * Header for sinf, cosf and sincosf.
  *
- * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -12,7 +12,7 @@
 /* 2PI * 2^-64.  */
 static const double pi63 = 0x1.921FB54442D18p-62;
 /* PI / 4.  */
-static const double pio4 = 0x1.921FB54442D18p-1;
+static const float pio4f = 0x1.921FB6p-1f;
 
 /* The constants and polynomials for sine and cosine.  */
 typedef struct
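
The pio4f change above (here and in cosf.c, sincosf.c and sinf.c) supplies abstop12 with a single-precision constant instead of relying on an implicit double-to-float conversion at each call site. A self-contained re-creation of the comparison; the abstop12 body below mirrors this codebase's definition but is rewritten with memcpy, as an assumption:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Top 12 bits of |x|'s IEEE-754 single-precision encoding
   (sign bit dropped).  */
static uint32_t
abstop12 (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return (u >> 20) & 0x7ff;
}

int main (void)
{
  const float pio4f = 0x1.921FB6p-1f;
  /* 0.5f < pi/4, so the small-argument fast path would be taken.  */
  printf ("%#x %d\n", abstop12 (pio4f), abstop12 (0.5f) < abstop12 (pio4f));
  return 0;
}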
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index ab4ac47..2252529 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -2,7 +2,7 @@
  * Data definition for sinf, cosf and sincosf.
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/math/sinf.c b/math/sinf.c
index ddbc1da..8dd8ae4 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision sin function.
  *
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <math.h>
@@ -21,7 +21,7 @@
   int n;
   const sincos_t *p = &__sincosf_table[0];
 
-  if (abstop12 (y) < abstop12 (pio4))
+  if (abstop12 (y) < abstop12 (pio4f))
     {
       s = x * x;
 
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 0c17826..6e18e36 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,8 +1,8 @@
 /*
  * Microbenchmark for math functions.
  *
- * Copyright (c) 2018-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #undef _GNU_SOURCE
@@ -66,6 +66,43 @@
 {
   return (v_float){x, x, x, x};
 }
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef svbool_t sv_bool;
+typedef svfloat64_t sv_double;
+
+#define sv_double_len() svcntd()
+
+static inline sv_double
+sv_double_load (const double *p)
+{
+  svbool_t pg = svptrue_b64();
+  return svld1(pg, p);
+}
+
+static inline sv_double
+sv_double_dup (double x)
+{
+  return svdup_n_f64(x);
+}
+
+typedef svfloat32_t sv_float;
+
+#define sv_float_len() svcntw()
+
+static inline sv_float
+sv_float_load (const float *p)
+{
+  svbool_t pg = svptrue_b32();
+  return svld1(pg, p);
+}
+
+static inline sv_float
+sv_float_dup (float x)
+{
+  return svdup_n_f32(x);
+}
+#endif
 #else
 /* dummy definitions to make things compile.  */
 typedef double v_double;
@@ -89,7 +126,6 @@
 {
   return x;
 }
-
 #if WANT_VMATH
 #if __aarch64__
 static v_double
@@ -116,101 +152,25 @@
 {
   return x;
 }
-
-__vpcs static v_float
-xy__vn_powf (v_float x)
+#endif
+#if WANT_SVE_MATH
+static sv_double
+__sv_dummy (sv_double x, sv_bool pg)
 {
-  return __vn_powf (x, x);
+  return x;
 }
 
-__vpcs static v_float
-xy_Z_powf (v_float x)
+static sv_float
+__sv_dummyf (sv_float x, sv_bool pg)
 {
-  return _ZGVnN4vv_powf (x, x);
+  return x;
 }
 
-__vpcs static v_double
-xy__vn_pow (v_double x)
-{
-  return __vn_pow (x, x);
-}
-
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
-  return _ZGVnN2vv_pow (x, x);
-}
+#endif
+#endif
 #endif
 
-static v_float
-xy__v_powf (v_float x)
-{
-  return __v_powf (x, x);
-}
-
-static v_double
-xy__v_pow (v_double x)
-{
-  return __v_pow (x, x);
-}
-#endif
-
-static float
-xy__s_powf (float x)
-{
-  return __s_powf (x, x);
-}
-
-static double
-xy__s_pow (double x)
-{
-  return __s_pow (x, x);
-}
-#endif
-
-static double
-xypow (double x)
-{
-  return pow (x, x);
-}
-
-static float
-xypowf (float x)
-{
-  return powf (x, x);
-}
-
-static double
-xpow (double x)
-{
-  return pow (x, 23.4);
-}
-
-static float
-xpowf (float x)
-{
-  return powf (x, 23.4f);
-}
-
-static double
-ypow (double x)
-{
-  return pow (2.34, x);
-}
-
-static float
-ypowf (float x)
-{
-  return powf (2.34f, x);
-}
-
-static float
-sincosf_wrap (float x)
-{
-  float s, c;
-  sincosf (x, &s, &c);
-  return s + c;
-}
+#include "test/mathbench_wrappers.h"
 
 static const struct fun
 {
@@ -229,6 +189,10 @@
     __vpcs v_double (*vnd) (v_double);
     __vpcs v_float (*vnf) (v_float);
 #endif
+#if WANT_SVE_MATH
+    sv_double (*svd) (sv_double, sv_bool);
+    sv_float (*svf) (sv_float, sv_bool);
+#endif
   } fun;
 } funtab[] = {
 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
@@ -237,106 +201,25 @@
 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
+#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
+#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
 D (dummy, 1.0, 2.0)
-D (exp, -9.9, 9.9)
-D (exp, 0.5, 1.0)
-D (exp2, -9.9, 9.9)
-D (log, 0.01, 11.1)
-D (log, 0.999, 1.001)
-D (log2, 0.01, 11.1)
-D (log2, 0.999, 1.001)
-{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
-D (xpow, 0.01, 11.1)
-D (ypow, -9.9, 9.9)
-D (erf, -6.0, 6.0)
-
 F (dummyf, 1.0, 2.0)
-F (expf, -9.9, 9.9)
-F (exp2f, -9.9, 9.9)
-F (logf, 0.01, 11.1)
-F (log2f, 0.01, 11.1)
-{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
-F (xpowf, 0.01, 11.1)
-F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
-F (sinf, 0.1, 0.7)
-F (sinf, 0.8, 3.1)
-F (sinf, -3.1, 3.1)
-F (sinf, 3.3, 33.3)
-F (sinf, 100, 1000)
-F (sinf, 1e6, 1e32)
-F (cosf, 0.1, 0.7)
-F (cosf, 0.8, 3.1)
-F (cosf, -3.1, 3.1)
-F (cosf, 3.3, 33.3)
-F (cosf, 100, 1000)
-F (cosf, 1e6, 1e32)
-F (erff, -4.0, 4.0)
 #if WANT_VMATH
-D (__s_sin, -3.1, 3.1)
-D (__s_cos, -3.1, 3.1)
-D (__s_exp, -9.9, 9.9)
-D (__s_log, 0.01, 11.1)
-{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
-F (__s_expf, -9.9, 9.9)
-F (__s_expf_1u, -9.9, 9.9)
-F (__s_exp2f, -9.9, 9.9)
-F (__s_exp2f_1u, -9.9, 9.9)
-F (__s_logf, 0.01, 11.1)
-{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
-F (__s_sinf, -3.1, 3.1)
-F (__s_cosf, -3.1, 3.1)
 #if __aarch64__
 VD (__v_dummy, 1.0, 2.0)
-VD (__v_sin, -3.1, 3.1)
-VD (__v_cos, -3.1, 3.1)
-VD (__v_exp, -9.9, 9.9)
-VD (__v_log, 0.01, 11.1)
-{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
 VF (__v_dummyf, 1.0, 2.0)
-VF (__v_expf, -9.9, 9.9)
-VF (__v_expf_1u, -9.9, 9.9)
-VF (__v_exp2f, -9.9, 9.9)
-VF (__v_exp2f_1u, -9.9, 9.9)
-VF (__v_logf, 0.01, 11.1)
-{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
-VF (__v_sinf, -3.1, 3.1)
-VF (__v_cosf, -3.1, 3.1)
 #ifdef __vpcs
 VND (__vn_dummy, 1.0, 2.0)
-VND (__vn_exp, -9.9, 9.9)
-VND (_ZGVnN2v_exp, -9.9, 9.9)
-VND (__vn_log, 0.01, 11.1)
-VND (_ZGVnN2v_log, 0.01, 11.1)
-{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
-{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
-VND (__vn_sin, -3.1, 3.1)
-VND (_ZGVnN2v_sin, -3.1, 3.1)
-VND (__vn_cos, -3.1, 3.1)
-VND (_ZGVnN2v_cos, -3.1, 3.1)
 VNF (__vn_dummyf, 1.0, 2.0)
-VNF (__vn_expf, -9.9, 9.9)
-VNF (_ZGVnN4v_expf, -9.9, 9.9)
-VNF (__vn_expf_1u, -9.9, 9.9)
-VNF (__vn_exp2f, -9.9, 9.9)
-VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
-VNF (__vn_exp2f_1u, -9.9, 9.9)
-VNF (__vn_logf, 0.01, 11.1)
-VNF (_ZGVnN4v_logf, 0.01, 11.1)
-{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
-{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
-VNF (__vn_sinf, -3.1, 3.1)
-VNF (_ZGVnN4v_sinf, -3.1, 3.1)
-VNF (__vn_cosf, -3.1, 3.1)
-VNF (_ZGVnN4v_cosf, -3.1, 3.1)
+#endif
+#if WANT_SVE_MATH
+SVD (__sv_dummy, 1.0, 2.0)
+SVF (__sv_dummyf, 1.0, 2.0)
 #endif
 #endif
 #endif
+#include "test/mathbench_funcs.h"
 {0},
 #undef F
 #undef D
@@ -344,6 +227,8 @@
 #undef VD
 #undef VNF
 #undef VND
+#undef SVF
+#undef SVD
 };
 
 static void
@@ -508,6 +393,40 @@
 }
 #endif
 
+#if WANT_SVE_MATH
+static void
+run_sv_thruput (sv_double f (sv_double, sv_bool))
+{
+  for (int i = 0; i < N; i += sv_double_len ())
+    f (sv_double_load (A+i), svptrue_b64 ());
+}
+
+static void
+runf_sv_thruput (sv_float f (sv_float, sv_bool))
+{
+  for (int i = 0; i < N; i += sv_float_len ())
+    f (sv_float_load (Af+i), svptrue_b32 ());
+}
+
+static void
+run_sv_latency (sv_double f (sv_double, sv_bool))
+{
+  sv_double z = sv_double_dup (zero);
+  sv_double prev = z;
+  for (int i = 0; i < N; i += sv_double_len ())
+    prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
+}
+
+static void
+runf_sv_latency (sv_float f (sv_float, sv_bool))
+{
+  sv_float z = sv_float_dup (zero);
+  sv_float prev = z;
+  for (int i = 0; i < N; i += sv_float_len ())
+    prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
+}
+#endif
+
 static uint64_t
 tic (void)
 {
@@ -570,6 +489,16 @@
   else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
     TIMEIT (runf_vn_latency, f->fun.vnf);
 #endif
+#if WANT_SVE_MATH
+  else if (f->prec == 'd' && type == 't' && f->vec == 's')
+    TIMEIT (run_sv_thruput, f->fun.svd);
+  else if (f->prec == 'd' && type == 'l' && f->vec == 's')
+    TIMEIT (run_sv_latency, f->fun.svd);
+  else if (f->prec == 'f' && type == 't' && f->vec == 's')
+    TIMEIT (runf_sv_thruput, f->fun.svf);
+  else if (f->prec == 'f' && type == 'l' && f->vec == 's')
+    TIMEIT (runf_sv_latency, f->fun.svf);
+#endif
 
   if (type == 't')
     {
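
The SVE benchmark paths above use a handful of ACLE intrinsics (svptrue_b64, svld1, svdup_n_f64, svmad_f64_x, svcntd). A standalone sketch of the same load/mad pattern, assuming an SVE-capable toolchain (compile with something like -march=armv8.2-a+sve); the array contents are arbitrary:

#include <arm_sve.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  double a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  double out[8];
  svfloat64_t z = svdup_n_f64 (0.0);
  for (uint64_t i = 0; i < 8; i += svcntd ())
    {
      /* Predicate off the tail in case the vector length does not
         divide the buffer size.  */
      svbool_t pg = svwhilelt_b64 (i, (uint64_t) 8);
      svfloat64_t x = svld1 (pg, a + i);
      /* x*z + x with z == 0: the mad-based chaining used by the
         latency loops above, reduced here to an identity.  */
      svst1 (pg, out + i, svmad_f64_x (pg, x, z, x));
    }
  printf ("%g %g\n", out[0], out[7]);
  return 0;
}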
diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h
new file mode 100644
index 0000000..ad6dd2a
--- /dev/null
+++ b/math/test/mathbench_funcs.h
@@ -0,0 +1,100 @@
+/*
+ * Function entries for mathbench.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+D (exp, -9.9, 9.9)
+D (exp, 0.5, 1.0)
+D (exp2, -9.9, 9.9)
+D (log, 0.01, 11.1)
+D (log, 0.999, 1.001)
+D (log2, 0.01, 11.1)
+D (log2, 0.999, 1.001)
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
+D (xpow, 0.01, 11.1)
+D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
+
+F (expf, -9.9, 9.9)
+F (exp2f, -9.9, 9.9)
+F (logf, 0.01, 11.1)
+F (log2f, 0.01, 11.1)
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
+F (xpowf, 0.01, 11.1)
+F (ypowf, -9.9, 9.9)
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
+F (sinf, 0.1, 0.7)
+F (sinf, 0.8, 3.1)
+F (sinf, -3.1, 3.1)
+F (sinf, 3.3, 33.3)
+F (sinf, 100, 1000)
+F (sinf, 1e6, 1e32)
+F (cosf, 0.1, 0.7)
+F (cosf, 0.8, 3.1)
+F (cosf, -3.1, 3.1)
+F (cosf, 3.3, 33.3)
+F (cosf, 100, 1000)
+F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
+#if WANT_VMATH
+D (__s_sin, -3.1, 3.1)
+D (__s_cos, -3.1, 3.1)
+D (__s_exp, -9.9, 9.9)
+D (__s_log, 0.01, 11.1)
+{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
+F (__s_expf, -9.9, 9.9)
+F (__s_expf_1u, -9.9, 9.9)
+F (__s_exp2f, -9.9, 9.9)
+F (__s_exp2f_1u, -9.9, 9.9)
+F (__s_logf, 0.01, 11.1)
+{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
+F (__s_sinf, -3.1, 3.1)
+F (__s_cosf, -3.1, 3.1)
+#if __aarch64__
+VD (__v_sin, -3.1, 3.1)
+VD (__v_cos, -3.1, 3.1)
+VD (__v_exp, -9.9, 9.9)
+VD (__v_log, 0.01, 11.1)
+{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
+VF (__v_expf, -9.9, 9.9)
+VF (__v_expf_1u, -9.9, 9.9)
+VF (__v_exp2f, -9.9, 9.9)
+VF (__v_exp2f_1u, -9.9, 9.9)
+VF (__v_logf, 0.01, 11.1)
+{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
+VF (__v_sinf, -3.1, 3.1)
+VF (__v_cosf, -3.1, 3.1)
+#ifdef __vpcs
+VND (__vn_exp, -9.9, 9.9)
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VND (__vn_log, 0.01, 11.1)
+VND (_ZGVnN2v_log, 0.01, 11.1)
+{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
+{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
+VND (__vn_sin, -3.1, 3.1)
+VND (_ZGVnN2v_sin, -3.1, 3.1)
+VND (__vn_cos, -3.1, 3.1)
+VND (_ZGVnN2v_cos, -3.1, 3.1)
+VNF (__vn_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (__vn_expf_1u, -9.9, 9.9)
+VNF (__vn_exp2f, -9.9, 9.9)
+VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
+VNF (__vn_exp2f_1u, -9.9, 9.9)
+VNF (__vn_logf, 0.01, 11.1)
+VNF (_ZGVnN4v_logf, 0.01, 11.1)
+{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
+{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
+VNF (__vn_sinf, -3.1, 3.1)
+VNF (_ZGVnN4v_sinf, -3.1, 3.1)
+VNF (__vn_cosf, -3.1, 3.1)
+VNF (_ZGVnN4v_cosf, -3.1, 3.1)
+#endif
+#endif
+#endif
diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h
new file mode 100644
index 0000000..8311f0f
--- /dev/null
+++ b/math/test/mathbench_wrappers.h
@@ -0,0 +1,104 @@
+/*
+ * Function wrappers for mathbench.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#if WANT_VMATH
+#if __aarch64__
+
+#ifdef __vpcs
+__vpcs static v_float
+xy__vn_powf (v_float x)
+{
+  return __vn_powf (x, x);
+}
+
+__vpcs static v_float
+xy_Z_powf (v_float x)
+{
+  return _ZGVnN4vv_powf (x, x);
+}
+
+__vpcs static v_double
+xy__vn_pow (v_double x)
+{
+  return __vn_pow (x, x);
+}
+
+__vpcs static v_double
+xy_Z_pow (v_double x)
+{
+  return _ZGVnN2vv_pow (x, x);
+}
+#endif // __vpcs
+
+static v_float
+xy__v_powf (v_float x)
+{
+  return __v_powf (x, x);
+}
+
+static v_double
+xy__v_pow (v_double x)
+{
+  return __v_pow (x, x);
+}
+#endif // __aarch64__
+
+static float
+xy__s_powf (float x)
+{
+  return __s_powf (x, x);
+}
+
+static double
+xy__s_pow (double x)
+{
+  return __s_pow (x, x);
+}
+#endif // WANT_VMATH
+
+static double
+xypow (double x)
+{
+  return pow (x, x);
+}
+
+static float
+xypowf (float x)
+{
+  return powf (x, x);
+}
+
+static double
+xpow (double x)
+{
+  return pow (x, 23.4);
+}
+
+static float
+xpowf (float x)
+{
+  return powf (x, 23.4f);
+}
+
+static double
+ypow (double x)
+{
+  return pow (2.34, x);
+}
+
+static float
+ypowf (float x)
+{
+  return powf (2.34f, x);
+}
+
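+/* Return the sum of both outputs so that neither the sine nor the cosine
+   computation can be optimised away.  */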
+static float
+sincosf_wrap (float x)
+{
+  float s, c;
+  sincosf (x, &s, &c);
+  return s + c;
+}
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 3108967..3168da4 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,8 +1,8 @@
 /*
  * mathtest.c - test rig for mathlib
  *
- * Copyright (c) 1998-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 1998-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <assert.h>
@@ -196,9 +196,11 @@
 #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name }
 #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name }
 
+#ifndef PL
 /* sincosf wrappers for easier testing.  */
 static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; }
 static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; }
+#endif
 
 test_func tfuncs[] = {
     /* trigonometric */
@@ -218,9 +220,10 @@
     TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT),
     TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4),
     TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4),
+#ifndef PL
     TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4),
     TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4),
-
+#endif
     /* hyperbolic */
     TFUNC(at_d, rt_d, atanh, 4*ULPUNIT),
     TFUNC(at_d, rt_d, asinh, 4*ULPUNIT),
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index 6be79e1..5b3e9b4 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -2,7 +2,7 @@
  * dotest.c - actually generate mathlib test cases
  *
  * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdio.h>
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index 12a9c74..3ebd7dd 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -2,7 +2,7 @@
  * intern.h
  *
  * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef mathtest_intern_h
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index 0d8ead8..3d533c9 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -2,7 +2,7 @@
  * main.c
  *
  * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <assert.h>
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index 5612396..1de3258 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -2,7 +2,7 @@
  * random.c - random number generator for producing mathlib test cases
  *
  * Copyright (c) 1998-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "types.h"
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index b4b22df..0b477d7 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -2,7 +2,7 @@
  * random.h - header for random.c
  *
  * Copyright (c) 2009-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "types.h"
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index c9f0daf..70a7844 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -2,7 +2,7 @@
  * semi.c: test implementations of mathlib seminumerical functions
  *
  * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdio.h>
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index 17dc415..7a1444e 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -2,7 +2,7 @@
  * semi.h: header for semi.c
  *
  * Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef test_semi_h
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 53cd557..e15b4e0 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -2,7 +2,7 @@
  * types.h
  *
  * Copyright (c) 2005-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef mathtest_types_h
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index de45ac5..4410171 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -2,7 +2,7 @@
  * wrappers.c - wrappers to modify output of MPFR/MPC test functions
  *
  * Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <assert.h>
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 7b09c85..0a8a587 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -2,7 +2,7 @@
  * wrappers.h - wrappers to modify output of MPFR/MPC test functions
  *
  * Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 typedef struct {
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 0190d9a..b4000f6 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,8 +2,8 @@
 
 # ULP error check script.
 #
-# Copyright (c) 2019-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 #set -x
 set -eu
@@ -145,7 +145,7 @@
 # vector functions
 Ldir=0.5
 r='n'
-flags="${ULPFLAGS:--q} -f"
+flags="${ULPFLAGS:--q}"
 runs=
 check __s_exp 1 && runs=1
 runv=
@@ -229,7 +229,7 @@
 L_cosf=1.4
 L_powf=2.1
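+# Fourth column, D: optional per-line flags (such as -f) that are passed
+# through to the ulp invocation for that run.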
 
-while read G F R
+while read G F R D
 do
 	[ "$R" = 1 ] || continue
 	case "$G" in \#*) continue ;; esac
@@ -239,7 +239,16 @@
 	do
 		[ -n "$X" ] || continue
 		case "$X" in \#*) continue ;; esac
-		t $F $X
+		disable_fenv=""
+		if [ -z "${WANT_SIMD_EXCEPT:-}" ] || [ "$WANT_SIMD_EXCEPT" -eq 0 ]; then
+			# If the library was built with SIMD exceptions
+			# disabled, disable fenv checking in the ulp
+			# tool. Otherwise, fenv checking may still be
+			# disabled by adding -f to the end of the run
+			# line.
+			disable_fenv="-f"
+		fi
+		t $D $disable_fenv $F $X
 	done << EOF
 $range
 EOF
@@ -255,10 +264,10 @@
 log  __vn_log      $runvn
 log  _ZGVnN2v_log  $runvn
 
-pow __s_pow       $runs
-pow __v_pow       $runv
-pow __vn_pow      $runvn
-pow _ZGVnN2vv_pow $runvn
+pow __s_pow       $runs         -f
+pow __v_pow       $runv         -f
+pow __vn_pow      $runvn        -f
+pow _ZGVnN2vv_pow $runvn        -f
 
 sin __s_sin       $runs
 sin __v_sin       $runv
@@ -275,18 +284,18 @@
 expf __vn_expf     $runvn
 expf _ZGVnN4v_expf $runvn
 
-expf_1u __s_expf_1u   $runs
-expf_1u __v_expf_1u   $runv
-expf_1u __vn_expf_1u  $runvn
+expf_1u __s_expf_1u   $runs     -f
+expf_1u __v_expf_1u   $runv     -f
+expf_1u __vn_expf_1u  $runvn    -f
 
 exp2f __s_exp2f      $runs
 exp2f __v_exp2f      $runv
 exp2f __vn_exp2f     $runvn
 exp2f _ZGVnN4v_exp2f $runvn
 
-exp2f_1u __s_exp2f_1u  $runs
-exp2f_1u __v_exp2f_1u  $runv
-exp2f_1u __vn_exp2f_1u $runvn
+exp2f_1u __s_exp2f_1u  $runs    -f
+exp2f_1u __v_exp2f_1u  $runv    -f
+exp2f_1u __vn_exp2f_1u $runvn   -f
 
 logf __s_logf      $runs
 logf __v_logf      $runv
@@ -303,10 +312,10 @@
 cosf __vn_cosf     $runvn
 cosf _ZGVnN4v_cosf $runvn
 
-powf __s_powf       $runs
-powf __v_powf       $runv
-powf __vn_powf      $runvn
-powf _ZGVnN4vv_powf $runvn
+powf __s_powf       $runs       -f
+powf __v_powf       $runv       -f
+powf __vn_powf      $runvn      -f
+powf _ZGVnN4vv_powf $runvn      -f
 EOF
 
 [ 0 -eq $FAIL ] || {
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 7916044..7ea0d45 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,7 +1,7 @@
 ; cosf.tst - Directed test cases for SP cosine
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=cosf op1=7fc00001 result=7fc00001 errno=0
 func=cosf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
index 7fa4d18..12384ce 100644
--- a/math/test/testcases/directed/erf.tst
+++ b/math/test/testcases/directed/erf.tst
@@ -1,7 +1,7 @@
 ; erf.tst - Directed test cases for erf
 ;
 ; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
index d05b7b1..28f8fa3 100644
--- a/math/test/testcases/directed/erff.tst
+++ b/math/test/testcases/directed/erff.tst
@@ -1,7 +1,7 @@
 ; erff.tst
 ;
 ; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=erff op1=7fc00001 result=7fc00001 errno=0
 func=erff op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index 85d556c..0bb2ef4 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for exp
 ;
 ; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index fa56c9f..7069f90 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for exp2
 ;
 ; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 38cfc3f..6ca2eea 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,7 +1,7 @@
 ; exp2f.tst - Directed test cases for exp2f
 ;
 ; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=exp2f op1=7fc00001 result=7fc00001 errno=0
 func=exp2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index ff0f671..89ae8fe 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,7 +1,7 @@
 ; expf.tst - Directed test cases for expf
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=expf op1=7fc00001 result=7fc00001 errno=0
 func=expf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index a0aa398..686ea83 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for log
 ;
 ; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index ff1286c..361bdde 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for log2
 ;
 ; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 5832c4f..5fce051 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,7 +1,7 @@
 ; log2f.tst - Directed test cases for log2f
 ;
 ; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=log2f op1=7fc00001 result=7fc00001 errno=0
 func=log2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 6e68a36..a6d1b9d 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,7 +1,7 @@
 ; logf.tst - Directed test cases for logf
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=logf op1=7fc00001 result=7fc00001 errno=0
 func=logf op1=ffc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index 1966581..879d128 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for pow
 ;
 ; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index 3fa8b11..46d5224 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,7 +1,7 @@
 ; powf.tst - Directed test cases for powf
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
 func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index 4b33d22..cddb346 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,7 +1,7 @@
 ; Directed test cases for SP sincos
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 
 func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index ded80b1..041b13d 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,7 +1,7 @@
 ; sinf.tst - Directed test cases for SP sine
 ;
 ; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 
 func=sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c24ff80..8e885d6 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,7 +1,7 @@
 !! double.tst - Random test case specification for DP functions
 !!
 !! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 test exp 10000
 test exp2 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index d02a227..ea4a5a0 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,7 +1,7 @@
 !! single.tst - Random test case specification for SP functions
 !!
 !! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 test sinf 10000
 test cosf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 51479b8..bb8c3ad 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,8 +1,8 @@
 /*
  * ULP error checking tool for math functions.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <ctype.h>
@@ -214,16 +214,6 @@
   double errlim;
 };
 
-/* Wrappers for sincos.  */
-static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
-static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
-static double sincos_sin(double x) {(void)cos(x); return sin(x);}
-static double sincos_cos(double x) {(void)sin(x); return cos(x);}
-#if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
-#endif
-
 /* A bit of a hack: call vector functions twice with the same
    input in lane 0 but a different value in other lanes: once
    with an in-range value and then with a special case value.  */
@@ -233,52 +223,81 @@
 #if __aarch64__ && WANT_VMATH
 typedef __f32x4_t v_float;
 typedef __f64x2_t v_double;
-static const float fv[2] = {1.0f, -INFINITY};
-static const double dv[2] = {1.0, -INFINITY};
+/* First element of fv and dv may be changed by the -c argument.  */
+static float fv[2] = {1.0f, -INFINITY};
+static double dv[2] = {1.0, -INFINITY};
 static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
 static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef __SVFloat32_t sv_float;
+typedef __SVFloat64_t sv_double;
 
-static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
-static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
-static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
-static float v_expf(float x) { return __v_expf(argf(x))[0]; }
-static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
-static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
-static float v_logf(float x) { return __v_logf(argf(x))[0]; }
-static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
-static double v_sin(double x) { return __v_sin(argd(x))[0]; }
-static double v_cos(double x) { return __v_cos(argd(x))[0]; }
-static double v_exp(double x) { return __v_exp(argd(x))[0]; }
-static double v_log(double x) { return __v_log(argd(x))[0]; }
-static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
-#ifdef __vpcs
-static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
-static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
-static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
-static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
-static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
-static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
-static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
-static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
-static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
-static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
-static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
-static double vn_log(double x) { return __vn_log(argd(x))[0]; }
-static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
-static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
-static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
-static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
-static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
-static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
-static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
-static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
-static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
-static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
-static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
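+/* Per-lane wrappers for SVE routines: the vector length is not fixed, so
+   build an svcntw()/svcntd()-element buffer with the argument in every
+   lane and the control value in the last lane, then load it; results are
+   stored back and lane 0 is returned.  */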
+static inline sv_float
+svargf (float x)
+{
+  int n = svcntw ();
+  float base[n];
+  for (int i = 0; i < n; i++)
+    base[i] = x;
+  base[n - 1] = fv[secondcall];
+  return svld1 (svptrue_b32 (), base);
+}
+static inline sv_double
+svargd (double x)
+{
+  int n = svcntd ();
+  double base[n];
+  for (int i = 0; i < n; i++)
+    base[i] = x;
+  base[n - 1] = dv[secondcall];
+  return svld1 (svptrue_b64 (), base);
+}
+static inline float
+svretf (sv_float vec)
+{
+  int n = svcntw ();
+  float res[n];
+  svst1 (svptrue_b32 (), res, vec);
+  return res[0];
+}
+static inline double
+svretd (sv_double vec)
+{
+  int n = svcntd ();
+  double res[n];
+  svst1 (svptrue_b64 (), res, vec);
+  return res[0];
+}
 #endif
 #endif
 
+#if WANT_SVE_MATH
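+/* Identity routines: __sv_dummy and __sv_dummyf exercise the SVE wrapper
+   plumbing, while dummy and dummyl provide the scalar and long-double
+   references that the SVD1/SVF1 entries below expand to.  */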
+long double
+dummyl (long double x)
+{
+  return x;
+}
+
+double
+dummy (double x)
+{
+  return x;
+}
+
+static sv_double
+__sv_dummy (sv_double x)
+{
+  return x;
+}
+
+static sv_float
+__sv_dummyf (sv_float x)
+{
+  return x;
+}
+#endif
+
+#include "test/ulp_wrappers.h"
+
+/* Wrappers for SVE functions.  */
+#if WANT_SVE_MATH
+static double sv_dummy (double x) { return svretd (__sv_dummy (svargd (x))); }
+static float sv_dummyf (float x) { return svretf (__sv_dummyf (svargf (x))); }
+#endif
+
 struct fun
 {
   const char *name;
@@ -322,83 +341,53 @@
 #define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
 #define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
 #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
- F1 (sin)
- F1 (cos)
- F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
- F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
- F1 (exp)
- F1 (exp2)
- F1 (log)
- F1 (log2)
- F2 (pow)
- F1 (erf)
- D1 (exp)
- D1 (exp2)
- D1 (log)
- D1 (log2)
- D2 (pow)
- D1 (erf)
-#if WANT_VMATH
- F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
- F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
- F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
- F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
- F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
- F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
- F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
- F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
- F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
-#if __aarch64__
- F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
-#ifdef __vpcs
- F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
- F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
- F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
- F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
+/* Neon routines.  */
+#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVNF1(x) VNF1 (x) ZVF1 (x)
+#define ZVNF2(x) VNF2 (x) ZVF2 (x)
+#define ZVND1(x) VND1 (x) ZVD1 (x)
+#define ZVND2(x) VND2 (x) ZVD2 (x)
+#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+/* SVE routines.  */
+#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define SVD1(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define SVD2(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZSVF1(x) F (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZSVF2(x) F (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZSVD1(x) F (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZSVD2(x) F (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+
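+/* ulp_funcs.h is an X-macro table: each entry expands through the F/F1/F2/
+   D1/D2 and vector macros above, so the same function list can be reused
+   with different macro definitions.  */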
+#include "test/ulp_funcs.h"
+
+#if WANT_SVE_MATH
+ SVD1 (dummy)
+ SVF1 (dummy)
 #endif
-#endif
-#endif
+
 #undef F
 #undef F1
 #undef F2
 #undef D1
 #undef D2
+#undef SVF1
+#undef SVF2
+#undef SVD1
+#undef SVD2
  {0}};
 
 /* Boilerplate for generic calls.  */
@@ -645,6 +634,11 @@
   puts ("-q: quiet.");
   puts ("-m: use mpfr even if faster method is available.");
   puts ("-f: disable fenv testing (rounding modes and exceptions).");
+#if __aarch64__ && WANT_VMATH
+  puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
+	"    This should be different from tested input in other lanes, and non-special \n"
+	"    (i.e. should not trigger fenv exceptions). Default is 1.");
+#endif
   puts ("Supported func:");
   for (const struct fun *f = fun; f->name; f++)
     printf ("\t%s\n", f->name);
@@ -812,6 +806,14 @@
 	      conf.rc = argv[0][0];
 	    }
 	  break;
+#if __aarch64__ && WANT_VMATH
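+	  /* -c sets the neutral 'control value' placed in the unchecked
+	     lanes of vector arguments (see fv and dv above).  */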
+	case 'c':
+	  argc--;
+	  argv++;
+	  fv[0] = strtof (argv[0], 0);
+	  dv[0] = strtod (argv[0], 0);
+	  break;
+#endif
 	default:
 	  usage ();
 	}
diff --git a/math/test/ulp.h b/math/test/ulp.h
index a0c3016..327b4bd 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -2,7 +2,7 @@
  * Generic functions for ULP error estimation.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* For each different math function type,
diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h
new file mode 100644
index 0000000..f5cea4d
--- /dev/null
+++ b/math/test/ulp_funcs.h
@@ -0,0 +1,78 @@
+/*
+ * Function entries for ulp.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+ F1 (sin)
+ F1 (cos)
+ F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
+ F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
+ F1 (exp)
+ F1 (exp2)
+ F1 (log)
+ F1 (log2)
+ F2 (pow)
+ F1 (erf)
+ D1 (exp)
+ D1 (exp2)
+ D1 (log)
+ D1 (log2)
+ D2 (pow)
+ D1 (erf)
+#if WANT_VMATH
+ F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
+ F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
+ F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
+ F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
+ F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
+ F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
+ F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
+ F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+ F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+ F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
+ F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
+ F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
+#if __aarch64__
+ F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+ F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
+ F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
+ F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
+ F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
+ F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
+#ifdef __vpcs
+ F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+ F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
+ F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
+ F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
+ F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
+ F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
+ F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+ F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
+ F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
+ F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
+ F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
+ F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
+#endif
+#endif
+#endif
diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h
new file mode 100644
index 0000000..fd9e00c
--- /dev/null
+++ b/math/test/ulp_wrappers.h
@@ -0,0 +1,59 @@
+/*
+ * Function wrappers for ulp.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Wrappers for sincos.  */
+static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
+static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
+static double sincos_sin(double x) {(void)cos(x); return sin(x);}
+static double sincos_cos(double x) {(void)sin(x); return cos(x);}
+#if USE_MPFR
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+#endif
+
+/* Wrappers for vector functions.  */
+#if __aarch64__ && WANT_VMATH
+static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
+static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
+static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
+static float v_expf(float x) { return __v_expf(argf(x))[0]; }
+static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
+static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
+static float v_logf(float x) { return __v_logf(argf(x))[0]; }
+static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
+static double v_sin(double x) { return __v_sin(argd(x))[0]; }
+static double v_cos(double x) { return __v_cos(argd(x))[0]; }
+static double v_exp(double x) { return __v_exp(argd(x))[0]; }
+static double v_log(double x) { return __v_log(argd(x))[0]; }
+static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
+#ifdef __vpcs
+static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
+static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
+static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
+static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
+static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
+static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
+static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
+static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
+static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
+static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
+static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
+static double vn_log(double x) { return __vn_log(argd(x))[0]; }
+static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
+static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
+static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
+static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
+static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
+static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
+static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
+static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
+static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
+static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
+static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
+#endif
+#endif
diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya
index bd72d6b..6690adf 100644
--- a/math/tools/cos.sollya
+++ b/math/tools/cos.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating cos(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 8;   // polynomial degree
 a = -pi/4; // interval
diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya
index b7a462c..0668bdb 100644
--- a/math/tools/exp.sollya
+++ b/math/tools/exp.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating e^x
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 5; // poly degree
 N = 128; // table entries
diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya
index e760769..bd0a42d 100644
--- a/math/tools/exp2.sollya
+++ b/math/tools/exp2.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating 2^x
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 // exp2f parameters
 deg = 3; // poly degree
diff --git a/math/tools/log.sollya b/math/tools/log.sollya
index 6df4db4..5288f55 100644
--- a/math/tools/log.sollya
+++ b/math/tools/log.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating log(1+x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 12; // poly degree
 // |log(1+x)| > 0x1p-4 outside the interval
diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya
index 4a364c0..85811be 100644
--- a/math/tools/log2.sollya
+++ b/math/tools/log2.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating log2(1+x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 11; // poly degree
 // |log2(1+x)| > 0x1p-4 outside the interval
diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya
index 82c4dac..d018ba0 100644
--- a/math/tools/log2_abs.sollya
+++ b/math/tools/log2_abs.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating log2(1+x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 7; // poly degree
 // interval ~= 1/(2*N), where N is the table entries
diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya
index a2ac190..5f9bfe4 100644
--- a/math/tools/log_abs.sollya
+++ b/math/tools/log_abs.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating log(1+x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 6; // poly degree
 // interval ~= 1/(2*N), where N is the table entries
diff --git a/math/tools/plot.py b/math/tools/plot.py
index 6c8b89f..a0fa023 100755
--- a/math/tools/plot.py
+++ b/math/tools/plot.py
@@ -3,7 +3,7 @@
 # ULP error plot tool.
 #
 # Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index 2ff436f..1deab67 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -4,7 +4,7 @@
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
 # Copyright (c) 2015-2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import Base.\
 
diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya
index a6e8511..a193000 100644
--- a/math/tools/sin.sollya
+++ b/math/tools/sin.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating sin(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 7;   // polynomial degree
 a = -pi/4; // interval
diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya
index c0abb63..5fa7de7 100644
--- a/math/tools/v_exp.sollya
+++ b/math/tools/v_exp.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating e^x
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 4; // poly degree
 N = 128; // table entries
diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya
index cc3d2c4..d982524 100644
--- a/math/tools/v_log.sollya
+++ b/math/tools/v_log.sollya
@@ -1,7 +1,7 @@
 // polynomial used for __v_log(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 6; // poly degree
 a = -0x1.fc1p-9;
diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya
index 65cc995..63b9d65 100644
--- a/math/tools/v_sin.sollya
+++ b/math/tools/v_sin.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating sin(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 15;  // polynomial degree
 a = -pi/2; // interval
diff --git a/math/v_cos.c b/math/v_cos.c
index 20ba6bd..4c8787e 100644
--- a/math/v_cos.c
+++ b/math/v_cos.c
@@ -1,8 +1,8 @@
 /*
  * Double-precision vector cos function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -55,6 +55,14 @@
   r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
   cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
 
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+       specialcase later.  */
+    r = v_sel_f64 (cmp, v_f64 (1.0), r);
+#endif
+
   /* n = rint((|x|+pi/2)/pi) - 0.5.  */
   n = v_fma_f64 (InvPi, r + HalfPi, Shift);
   odd = v_as_u64_f64 (n) << 63;
diff --git a/math/v_cosf.c b/math/v_cosf.c
index 150294b..bd677c3 100644
--- a/math/v_cosf.c
+++ b/math/v_cosf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision vector cos function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -47,6 +47,14 @@
   r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
   cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
 
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+       specialcase later.  */
+    r = v_sel_f32 (cmp, v_f32 (1.0f), r);
+#endif
+
   /* n = rint((|x|+pi/2)/pi) - 0.5 */
   n = v_fma_f32 (InvPi, r + HalfPi, Shift);
   odd = v_as_u32_f32 (n) << 31;
diff --git a/math/v_exp.c b/math/v_exp.c
index e459d53..da23fd1 100644
--- a/math/v_exp.c
+++ b/math/v_exp.c
@@ -1,8 +1,8 @@
 /*
  * Double-precision vector e^x function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -36,6 +36,22 @@
 #define Tab __v_exp_data
 #define IndexMask v_u64 (N - 1)
 #define Shift v_f64 (0x1.8p+52)
+
+#if WANT_SIMD_EXCEPT
+
+#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)).  */
+#define BigBound 0x408	/* top12 (asuint64 (0x1p9)).  */
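+/* A lane is special when top12 (asuint64 (|x|)) - TinyBound
+   >= BigBound - TinyBound: the unsigned subtraction wraps around for tiny
+   inputs, so a single compare catches both the tiny and the big range.  */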
+
+VPCS_ATTR static NOINLINE v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
 #define Thres v_f64 (704.0)
 
 VPCS_ATTR
@@ -54,6 +70,8 @@
   return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
 }
 
+#endif
+
 VPCS_ATTR
 v_f64_t
 V_NAME(exp) (v_f64_t x)
@@ -61,7 +79,18 @@
   v_f64_t n, r, r2, s, y, z;
   v_u64_t cmp, u, e, i;
 
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     specialcase to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  v_f64_t xm = x;
+  cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound
+		    >= BigBound - TinyBound);
+  if (unlikely (v_any_u64 (cmp)))
+    x = v_sel_f64 (cmp, v_f64 (1), x);
+#else
   cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
+#endif
 
   /* n = round(x/(ln2/N)).  */
   z = v_fma_f64 (x, InvLn2, Shift);
@@ -87,7 +116,12 @@
   s = v_as_f64_u64 (u + e);
 
   if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return specialcase (xm, v_fma_f64 (y, s, s), cmp);
+#else
     return specialcase (s, y, n);
+#endif
+
   return v_fma_f64 (y, s, s);
 }
 VPCS_ALIAS
diff --git a/math/v_exp.h b/math/v_exp.h
index 305da19..1e7f7f3 100644
--- a/math/v_exp.h
+++ b/math/v_exp.h
@@ -2,7 +2,7 @@
  * Declarations for double-precision e^x vector function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "v_math.h"
diff --git a/math/v_exp2f.c b/math/v_exp2f.c
index e3ea5af..7f40dba 100644
--- a/math/v_exp2f.c
+++ b/math/v_exp2f.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision vector 2^x function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -25,6 +25,22 @@
 
 #define Shift v_f32 (0x1.8p23f)
 
+#if WANT_SIMD_EXCEPT
+
+#define TinyBound 0x20000000 /* asuint (0x1p-63).  */
+#define BigBound 0x42800000  /* asuint (0x1p6).  */
+
+VPCS_ATTR
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
 VPCS_ATTR
 static v_f32_t
 specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
@@ -41,15 +57,28 @@
   return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
 }
 
+#endif
+
 VPCS_ATTR
 v_f32_t
 V_NAME(exp2f) (v_f32_t x)
 {
-  v_f32_t n, r, r2, scale, p, q, poly, absn;
+  v_f32_t n, r, r2, scale, p, q, poly;
   v_u32_t cmp, e;
 
-  /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
-     x = n + r, with r in [-1/2, 1/2].  */
+#if WANT_SIMD_EXCEPT
+  cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
+		    >= BigBound - TinyBound);
+  v_f32_t xm = x;
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     specialcase to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  if (unlikely (v_any_u32 (cmp)))
+    x = v_sel_f32 (cmp, v_f32 (1), x);
+#endif
+
+  /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = n + r, with r in [-1/2, 1/2].  */
 #if 0
   v_f32_t z;
   z = x + Shift;
@@ -62,16 +91,26 @@
   e = v_as_u32_s32 (v_round_s32 (x)) << 23;
 #endif
   scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
-  absn = v_abs_f32 (n);
+
+#if !WANT_SIMD_EXCEPT
+  v_f32_t absn = v_abs_f32 (n);
   cmp = v_cond_u32 (absn > v_f32 (126.0f));
+#endif
+
   r2 = r * r;
   p = v_fma_f32 (C0, r, C1);
   q = v_fma_f32 (C2, r, C3);
   q = v_fma_f32 (p, r2, q);
   p = C4 * r;
   poly = v_fma_f32 (q, r2, p);
+
   if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
+#else
     return specialcase (poly, n, e, absn, cmp, scale);
+#endif
+
   return v_fma_f32 (poly, scale, scale);
 }
 VPCS_ALIAS
diff --git a/math/v_exp2f_1u.c b/math/v_exp2f_1u.c
index 1caa14d..de1a32d 100644
--- a/math/v_exp2f_1u.c
+++ b/math/v_exp2f_1u.c
@@ -2,7 +2,7 @@
  * Single-precision vector 2^x function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_exp_data.c b/math/v_exp_data.c
index 3653554..30421da 100644
--- a/math/v_exp_data.c
+++ b/math/v_exp_data.c
@@ -2,7 +2,7 @@
  * Lookup table for double-precision e^x vector function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "v_exp.h"
diff --git a/math/v_expf.c b/math/v_expf.c
index d403e00..ade23b2 100644
--- a/math/v_expf.c
+++ b/math/v_expf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision vector e^x function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -28,6 +28,22 @@
 #define Ln2hi v_f32 (0x1.62e4p-1f)
 #define Ln2lo v_f32 (0x1.7f7d1cp-20f)
 
+#if WANT_SIMD_EXCEPT
+
+#define TinyBound 0x20000000 /* asuint (0x1p-63).  */
+#define BigBound 0x42800000  /* asuint (0x1p6).  */
+
+VPCS_ATTR
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
 VPCS_ATTR
 static v_f32_t
 specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
@@ -44,15 +60,28 @@
   return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
 }
 
+#endif
+
 VPCS_ATTR
 v_f32_t
 V_NAME(expf) (v_f32_t x)
 {
-  v_f32_t n, r, r2, scale, p, q, poly, absn, z;
+  v_f32_t n, r, r2, scale, p, q, poly, z;
   v_u32_t cmp, e;
 
-  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
-     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+#if WANT_SIMD_EXCEPT
+  cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
+		    >= BigBound - TinyBound);
+  v_f32_t xm = x;
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     specialcase to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  if (unlikely (v_any_u32 (cmp)))
+    x = v_sel_f32 (cmp, v_f32 (1), x);
+#endif
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
 #if 1
   z = v_fma_f32 (x, InvLn2, Shift);
   n = z - Shift;
@@ -67,16 +96,26 @@
   e = v_as_u32_s32 (v_round_s32 (z)) << 23;
 #endif
   scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
-  absn = v_abs_f32 (n);
+
+#if !WANT_SIMD_EXCEPT
+  v_f32_t absn = v_abs_f32 (n);
   cmp = v_cond_u32 (absn > v_f32 (126.0f));
+#endif
+
   r2 = r * r;
   p = v_fma_f32 (C0, r, C1);
   q = v_fma_f32 (C2, r, C3);
   q = v_fma_f32 (p, r2, q);
   p = C4 * r;
   poly = v_fma_f32 (q, r2, p);
+
   if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
+#else
     return specialcase (poly, n, e, absn, cmp, scale);
+#endif
+
   return v_fma_f32 (poly, scale, scale);
 }
 VPCS_ALIAS
diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c
index 023bd24..8f0ae91 100644
--- a/math/v_expf_1u.c
+++ b/math/v_expf_1u.c
@@ -2,7 +2,7 @@
  * Single-precision vector e^x function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_log.c b/math/v_log.c
index d84c740..47a8291 100644
--- a/math/v_log.c
+++ b/math/v_log.c
@@ -2,7 +2,7 @@
  * Double-precision vector log(x) function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_log.h b/math/v_log.h
index bcc2fa6..a37bbc2 100644
--- a/math/v_log.h
+++ b/math/v_log.h
@@ -2,7 +2,7 @@
  * Declarations for double-precision log(x) vector function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "v_math.h"
diff --git a/math/v_log_data.c b/math/v_log_data.c
index 97ee5b0..ec1c8e5 100644
--- a/math/v_log_data.c
+++ b/math/v_log_data.c
@@ -2,7 +2,7 @@
  * Lookup table for double-precision log(x) vector function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "v_log.h"
diff --git a/math/v_logf.c b/math/v_logf.c
index 7373192..93a5375 100644
--- a/math/v_logf.c
+++ b/math/v_logf.c
@@ -2,7 +2,7 @@
  * Single-precision vector log function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_math.h b/math/v_math.h
index f2cc467..3289916 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,8 +1,8 @@
 /*
  * Vector math abstractions.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _V_MATH_H
@@ -191,6 +191,11 @@
 {
   return __builtin_lroundf (x); /* relies on -fno-math-errno.  */
 }
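+/* Select: p is 0 or an all-bits mask per lane, so the conditional picks x
+   where p is set and y otherwise.  */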
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return p ? x : y;
+}
 /* convert to type1 from type2.  */
 static inline v_f32_t
 v_to_f32_s32 (v_s32_t x)
@@ -311,6 +316,11 @@
 {
   return __builtin_lround (x); /* relies on -fno-math-errno.  */
 }
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return p ? x : y;
+}
 /* convert to type1 from type2.  */
 static inline v_f64_t
 v_to_f64_s64 (v_s64_t x)
@@ -460,6 +470,11 @@
 {
   return vcvtaq_s32_f32 (x);
 }
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return vbslq_f32 (p, x, y);
+}
 /* convert to type1 from type2.  */
 static inline v_f32_t
 v_to_f32_s32 (v_s32_t x)
@@ -584,6 +599,11 @@
 {
   return vcvtaq_s64_f64 (x);
 }
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return vbslq_f64 (p, x, y);
+}
 /* convert to type1 from type2.  */
 static inline v_f64_t
 v_to_f64_s64 (v_s64_t x)
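
The v_sel helpers added above compile to a lane-wise bit select (BSL) on AdvSIMD and a plain ternary in the scalar fallback. A minimal scalar sketch of the BSL semantics, assuming the all-ones/all-zeros masks that v_cond_u32 produces:

#include <stdint.h>
#include <string.h>

/* Per-lane model of vbslq_f32: take bits of x where the mask p is set and
   bits of y elsewhere. With an all-ones or all-zeros mask this reduces to
   the scalar "p ? x : y" fallback.  */
static inline float
bitsel_f32 (uint32_t p, float x, float y)
{
  uint32_t ix, iy, ir;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);
  ir = (ix & p) | (iy & ~p);
  float r;
  memcpy (&r, &ir, sizeof r);
  return r;
}
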
diff --git a/math/v_pow.c b/math/v_pow.c
index a209d57..05a83aa 100644
--- a/math/v_pow.c
+++ b/math/v_pow.c
@@ -2,7 +2,7 @@
  * Double-precision vector pow function.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_powf.c b/math/v_powf.c
index fb80fa6..ad8ab8d 100644
--- a/math/v_powf.c
+++ b/math/v_powf.c
@@ -2,7 +2,7 @@
  * Single-precision vector powf function.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
diff --git a/math/v_sin.c b/math/v_sin.c
index 2b9ed05..9dbb9de 100644
--- a/math/v_sin.c
+++ b/math/v_sin.c
@@ -1,8 +1,8 @@
 /*
  * Double-precision vector sin function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -34,9 +34,15 @@
 #define Pi2 v_f64 (0x1.1a62633145c06p-53)
 #define Pi3 v_f64 (0x1.c1cd129024e09p-106)
 #define Shift v_f64 (0x1.8p52)
-#define RangeVal v_f64 (0x1p23)
 #define AbsMask v_u64 (0x7fffffffffffffff)
 
+#if WANT_SIMD_EXCEPT
+#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)).  */
+#define Thresh 0x214	/* top12 (asuint64 (RangeVal)) - TinyBound.  */
+#else
+#define RangeVal v_f64 (0x1p23)
+#endif
+
 VPCS_ATTR
 __attribute__ ((noinline)) static v_f64_t
 specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
@@ -49,11 +55,22 @@
 V_NAME(sin) (v_f64_t x)
 {
   v_f64_t n, r, r2, y;
-  v_u64_t sign, odd, cmp;
+  v_u64_t sign, odd, cmp, ir;
 
-  r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
+  ir = v_as_u64_f64 (x) & AbsMask;
+  r = v_as_f64_u64 (ir);
   sign = v_as_u64_f64 (x) & ~AbsMask;
-  cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
+
+#if WANT_SIMD_EXCEPT
+  /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be
+     triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
+     fenv). These lanes will be fixed by specialcase later.  */
+  cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh);
+  if (unlikely (v_any_u64 (cmp)))
+    r = v_sel_f64 (cmp, v_f64 (1), r);
+#else
+  cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal));
+#endif
 
   /* n = rint(|x|/pi).  */
   n = v_fma_f64 (InvPi, r, Shift);
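
The WANT_SIMD_EXCEPT bound check above folds the tiny and huge cutoffs into a single unsigned comparison on the top 12 bits of |x|: subtracting TinyBound wraps exponents below 0x202 around to huge values, so one compare catches both ends. A scalar sketch of the same test:

#include <stdint.h>
#include <string.h>

/* Flags |x| below ~0x1p-509 or at/above 0x1p23: top-12 values in
   [0x202, 0x416) map to [0, 0x214); tiny exponents wrap past Thresh.  */
static int
sin_lane_is_special (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t ia = ix & 0x7fffffffffffffff;   /* AbsMask.  */
  return (ia >> 52) - 0x202 >= 0x214;      /* TinyBound, Thresh.  */
}
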
diff --git a/math/v_sinf.c b/math/v_sinf.c
index e66bfce..ce35dac 100644
--- a/math/v_sinf.c
+++ b/math/v_sinf.c
@@ -1,8 +1,8 @@
 /*
  * Single-precision vector sin function.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "mathlib.h"
@@ -24,6 +24,7 @@
 #define A7 v_f32 (Poly[1])
 #define A9 v_f32 (Poly[0])
 #define RangeVal v_f32 (0x1p20f)
+#define TinyBound v_f32 (0x1p-61f)
 #define InvPi v_f32 (0x1.45f306p-2f)
 #define Shift v_f32 (0x1.8p+23f)
 #define AbsMask v_u32 (0x7fffffff)
@@ -41,11 +42,23 @@
 V_NAME(sinf) (v_f32_t x)
 {
   v_f32_t n, r, r2, y;
-  v_u32_t sign, odd, cmp;
+  v_u32_t sign, odd, cmp, ir;
 
-  r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
+  ir = v_as_u32_f32 (x) & AbsMask;
+  r = v_as_f32_u32 (ir);
   sign = v_as_u32_f32 (x) & ~AbsMask;
-  cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
+
+#if WANT_SIMD_EXCEPT
+  cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound)
+		     >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound)));
+  if (unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+       specialcase later.  */
+    r = v_sel_f32 (cmp, v_f32 (1), r);
+#else
+  cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal));
+#endif
 
   /* n = rint(|x|/pi) */
   n = v_fma_f32 (InvPi, r, Shift);
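
v_sinf applies the same folding to the full 32-bit patterns instead of the top bits: for non-negative IEEE floats, unsigned integer order matches float order, so a single subtract-and-compare tests whether |x| leaves [TinyBound, RangeVal). A scalar sketch:

#include <stdint.h>
#include <string.h>

static int
sinf_lane_is_special (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint32_t ia = ix & 0x7fffffff;   /* AbsMask.  */
  uint32_t lo = 0x21000000;        /* asuint (0x1p-61f) = TinyBound.  */
  uint32_t hi = 0x49800000;        /* asuint (0x1p20f) = RangeVal.  */
  return ia - lo >= hi - lo;       /* |x| < 0x1p-61f or |x| >= 0x1p20f.  */
}
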
diff --git a/math/vn_cos.c b/math/vn_cos.c
index b57a549..4b5b237 100644
--- a/math/vn_cos.c
+++ b/math/vn_cos.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_cos.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_cosf.c b/math/vn_cosf.c
index 6321d46..86dd26e 100644
--- a/math/vn_cosf.c
+++ b/math/vn_cosf.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_cosf.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_exp.c b/math/vn_exp.c
index 06e269d..0d85b17 100644
--- a/math/vn_exp.c
+++ b/math/vn_exp.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_exp.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c
index db9707e..da3bb40 100644
--- a/math/vn_exp2f.c
+++ b/math/vn_exp2f.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_exp2f.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c
index 17bd0ab..3e3a247 100644
--- a/math/vn_exp2f_1u.c
+++ b/math/vn_exp2f_1u.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_exp2f_1u.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_expf.c b/math/vn_expf.c
index 0652907..6e91a94 100644
--- a/math/vn_expf.c
+++ b/math/vn_expf.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_expf.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c
index 3be7768..57ae6a3 100644
--- a/math/vn_expf_1u.c
+++ b/math/vn_expf_1u.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_expf_1u.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_log.c b/math/vn_log.c
index b58fe8f..902bff1 100644
--- a/math/vn_log.c
+++ b/math/vn_log.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_log.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_logf.c b/math/vn_logf.c
index cc5b8ae..07e4936 100644
--- a/math/vn_logf.c
+++ b/math/vn_logf.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_logf.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_pow.c b/math/vn_pow.c
index 2609501..1a980ff 100644
--- a/math/vn_pow.c
+++ b/math/vn_pow.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_pow.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_powf.c b/math/vn_powf.c
index 095d07e..a42ade3 100644
--- a/math/vn_powf.c
+++ b/math/vn_powf.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_powf.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_sin.c b/math/vn_sin.c
index 905c796..64b05c8 100644
--- a/math/vn_sin.c
+++ b/math/vn_sin.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_sin.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/math/vn_sinf.c b/math/vn_sinf.c
index 1214e1a..6e880c6 100644
--- a/math/vn_sinf.c
+++ b/math/vn_sinf.c
@@ -2,7 +2,7 @@
  * AdvSIMD vector PCS variant of __v_sinf.
  *
  * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 #include "mathlib.h"
 #ifdef __vpcs
diff --git a/networking/Dir.mk b/networking/Dir.mk
index b496103..2589e0a 100644
--- a/networking/Dir.mk
+++ b/networking/Dir.mk
@@ -1,7 +1,7 @@
 # Makefile fragment - requires GNU make
 #
 # Copyright (c) 2019-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/networking
 B := build/networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
index 6d5be58..90c00eb 100644
--- a/networking/aarch64/chksum_simd.c
+++ b/networking/aarch64/chksum_simd.c
@@ -2,7 +2,7 @@
  * AArch64-specific checksum implementation using NEON
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "networking.h"
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
index 7f69adf..ae08fe5 100644
--- a/networking/arm/chksum_simd.c
+++ b/networking/arm/chksum_simd.c
@@ -2,7 +2,7 @@
  * Armv7-A specific checksum implementation using NEON
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "networking.h"
diff --git a/networking/chksum.c b/networking/chksum.c
index 95ce5ba..329482f 100644
--- a/networking/chksum.c
+++ b/networking/chksum.c
@@ -3,7 +3,7 @@
  * This sum is often used as a simple checksum in networking.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include "networking.h"
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
index 958c8cc..16f0f6c 100644
--- a/networking/chksum_common.h
+++ b/networking/chksum_common.h
@@ -2,7 +2,7 @@
  * Common code for checksum implementations
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef CHKSUM_COMMON_H
diff --git a/networking/include/networking.h b/networking/include/networking.h
index a88feff..297dd4b 100644
--- a/networking/include/networking.h
+++ b/networking/include/networking.h
@@ -2,7 +2,7 @@
  * Public API.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 unsigned short __chksum (const void *, unsigned int);
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
index 41b9812..239b5b8 100644
--- a/networking/test/chksum.c
+++ b/networking/test/chksum.c
@@ -2,7 +2,7 @@
  * Ones' complement checksum test & benchmark
  *
  * Copyright (c) 2016-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define _GNU_SOURCE
diff --git a/pl/Dir.mk b/pl/Dir.mk
new file mode 100644
index 0000000..2d00779
--- /dev/null
+++ b/pl/Dir.mk
@@ -0,0 +1,21 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# These targets are defined if we prescribe pl in SUBS.
+# It requires PLSUBS to be set.
+
+$(foreach sub,$(PLSUBS),$(eval include $(srcdir)/pl/$(sub)/Dir.mk))
+
+pl-files := $($(PLSUBS:%=pl/%-files))
+
+all-pl: $(PLSUBS:%=all-pl/%)
+
+check-pl: $(PLSUBS:%=check-pl/%)
+
+install-pl: $(PLSUBS:%=install-pl/%)
+
+clean-pl: $(PLSUBS:%=clean-pl/%)
+
+.PHONY: all-pl check-pl install-pl clean-pl

diff --git a/pl/README.contributors b/pl/README.contributors
new file mode 100644
index 0000000..3af9b1f
--- /dev/null
+++ b/pl/README.contributors
@@ -0,0 +1,23 @@
+Code in this sub-directory should follow the GNU Coding Standard, but it is
+not expected to be upstreamed into glibc without modification, so
+glibc-specific conventions need not be followed.
+
+The requirements for portable code apply to non-portable code with the
+following differences:
+
+
+1. Worst-case ULP error should be encoded in filenames (e.g. sin_3u5.c). There
+   are no specific restrictions on acceptable ULP error, but if functions
+   provide significantly less accuracy than portable equivalents then a clear
+   justification for inclusion should be stated in comments at the top of the
+   source file. Error bounds of the approximation should be clearly documented
+   in comments.
+
+2. Functions are assumed to support round-to-nearest mode by default, unless
+   stated; other rounding modes are not required to be provided.
+
+3. Handling of special cases may be relaxed for vector functions. Checking
+   whether each vector lane contains special values such as NaN, Inf or
+   denormal numbers can prove too costly for vector functions. This is often
+   not required since vector functions are typically used along with aggressive
+   compiler optimization flags.
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
new file mode 100644
index 0000000..be65344
--- /dev/null
+++ b/pl/math/Dir.mk
@@ -0,0 +1,229 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+PLM := $(srcdir)/pl/math
+AOR := $(srcdir)/math
+B := build/pl/math
+
+math-lib-srcs := $(wildcard $(PLM)/*.[cS])
+math-test-srcs := \
+	$(AOR)/test/mathtest.c \
+	$(AOR)/test/mathbench.c \
+	$(AOR)/test/ulp.c \
+
+math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS])
+
+math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
+math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
+
+math-libs := \
+	build/pl/lib/libmathlib.so \
+	build/pl/lib/libmathlib.a \
+
+math-tools := \
+	build/pl/bin/mathtest \
+	build/pl/bin/mathbench \
+	build/pl/bin/mathbench_libc \
+	build/pl/bin/runulp.sh \
+	build/pl/bin/ulp \
+
+math-host-tools := \
+	build/pl/bin/rtest \
+
+math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
+math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs)))
+math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
+math-target-objs := $(math-lib-objs) $(math-test-objs)
+math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
+
+pl/math-files := \
+	$(math-objs) \
+	$(math-libs) \
+	$(math-tools) \
+	$(math-host-tools) \
+	$(math-includes) \
+	$(math-test-includes) \
+
+all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
+
+$(math-objs): $(math-includes) $(math-test-includes)
+$(math-objs): CFLAGS_PL += $(math-cflags)
+$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno
+$(math-host-objs): CC = $(HOST_CC)
+$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS)
+
+build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs)
+	# Replace PL_SIG
+	cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@
+
+build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs)
+	# Replace PL_SIG macros with mathbench func entries
+	cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@
+
+build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs)
+	# Replace PL_SIG macros with ULP wrapper declarations
+	cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@
+
+$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h
+$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test
+
+$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h
+$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test
+
+build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
+	$(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^
+
+build/pl/lib/libmathlib.a: $(math-lib-objs)
+	rm -f $@
+	$(AR) rc $@ $^
+	$(RANLIB) $@
+
+$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
+$(math-tools): LDLIBS += $(math-ldlibs) -lm
+
+# Some targets to build pl/math/test from math/test sources
+build/pl/math/test/%.o: $(srcdir)/math/test/%.S
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/math/test/%.o: $(srcdir)/math/test/%.c
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/math/test/%.os: $(srcdir)/math/test/%.S
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/math/test/%.os: $(srcdir)/math/test/%.c
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+# Some targets to build pl/ sources using appropriate flags
+build/pl/%.o: $(srcdir)/pl/%.S
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/%.o: $(srcdir)/pl/%.c
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/%.os: $(srcdir)/pl/%.S
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/%.os: $(srcdir)/pl/%.c
+	$(CC) $(CFLAGS_PL) -c -o $@ $<
+
+build/pl/bin/rtest: $(math-host-objs)
+	$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
+
+build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a
+	$(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a
+	$(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+# This is not ideal, but allows custom symbols in mathbench to get resolved.
+build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a
+	$(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm
+
+build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a
+	$(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/pl/include/%.h: $(PLM)/include/%.h
+	cp $< $@
+
+build/pl/include/test/%.h: $(PLM)/test/%.h
+	cp $< $@
+
+build/pl/bin/%.sh: $(PLM)/test/%.sh
+	cp $< $@
+
+pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst)
+pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst)
+
+check-pl/math-test: $(math-tools)
+	cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
+
+check-pl/math-rtest: $(math-host-tools) $(math-tools)
+	cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
+
+ulp-input-dir=$(B)/test/inputs
+
+math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs)))
+math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs)))
+math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs)))
+math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs)))
+
+ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs)
+
+$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags)
+
+$(ulp-input-dir)/%.ulp: $(PLM)/%.c
+	mkdir -p $(@D)
+	$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@
+
+$(ulp-input-dir)/%.alias: $(PLM)/%.c
+	mkdir -p $(@D)
+	$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@
+
+$(ulp-input-dir)/%.fenv: $(PLM)/%.c
+	mkdir -p $(@D)
+	$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@
+
+$(ulp-input-dir)/%.itv: $(PLM)/%.c
+	mkdir -p $(dir $@)
+	$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@
+
+ulp-lims := $(ulp-input-dir)/limits
+$(ulp-lims): $(math-lib-lims)
+	cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@
+
+ulp-aliases := $(ulp-input-dir)/aliases
+$(ulp-aliases): $(math-lib-aliases)
+	cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@
+
+fenv-exps := $(ulp-input-dir)/fenv
+$(fenv-exps): $(math-lib-fenvs)
+	cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@
+
+ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias
+$(ulp-itvs-noalias): $(math-lib-itvs)
+	cat $^ > $@
+
+rename-aliases := $(ulp-input-dir)/rename_alias.sed
+$(rename-aliases): $(ulp-aliases)
+	# Build sed script for replacing aliases from generated alias file
+	cat $< |  awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@
+
+ulp-itvs-alias := $(ulp-input-dir)/itvs_alias
+$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases)
+	cat $< | sed  -f $(rename-aliases) > $@
+
+ulp-itvs := $(ulp-input-dir)/intervals
+$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias)
+	cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@
+
+check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs)
+	WANT_SVE_MATH=$(WANT_SVE_MATH) \
+	ULPFLAGS="$(math-ulpflags)" \
+	LIMITS=../../../$(ulp-lims) \
+	ALIASES=../../../$(ulp-aliases) \
+	INTERVALS=../../../$(ulp-itvs) \
+	FENV=../../../$(fenv-exps) \
+	build/pl/bin/runulp.sh $(EMULATOR)
+
+check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
+
+$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so
+	$(INSTALL) -D $< $@
+
+$(DESTDIR)$(libdir)/pl/%: build/pl/lib/%
+	$(INSTALL) -m 644 -D $< $@
+
+$(DESTDIR)$(includedir)/pl/%: build/pl/include/%
+	$(INSTALL) -m 644 -D $< $@
+
+install-pl/math: \
+ $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
+ $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
+
+clean-pl/math:
+	rm -f $(pl/math-files)
+
+.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math
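
The *_gen.h recipes above work by re-#defining PL_SIG on the compiler command line and letting token pasting build a per-signature macro name. A hedged illustration of what one rule produces (how the harness headers then expand _ZSD1 is defined elsewhere, in ulp.c's and mathbench.c's include files):

/* Annotation as written in acosh_3u.c further down this patch:

     PL_SIG (S, D, 1, acosh, 1.0, 10.0)

   Preprocessed with -D'PL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)' this becomes

     _ZSD1 (acosh)

   i.e. "scalar, double-precision, arity 1" pasted into a macro name that the
   test harnesses define in order to emit their per-function table entries.  */
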
diff --git a/pl/math/acosh_3u.c b/pl/math/acosh_3u.c
new file mode 100644
index 0000000..4e2cb67
--- /dev/null
+++ b/pl/math/acosh_3u.c
@@ -0,0 +1,66 @@
+/*
+ * Double-precision acosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Ln2 (0x1.62e42fefa39efp-1)
+#define MinusZero (0x8000000000000000)
+#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511).  */
+#define Two (0x4000000000000000)       /* asuint64(2.0).  */
+
+double
+optr_aor_log_f64 (double);
+
+double
+log1p (double);
+
+/* acosh approximation using a variety of approaches on different intervals:
+
+   acosh(x) = ln(x + sqrt(x * x - 1)).
+
+   x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
+   close enough to x that we can calculate the result by ln(2x) == ln(x) +
+   ln(2). The greatest observed error in this region is 0.98 ULP:
+   acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9
+				want 0x1.28066a11a7c8p+9.
+
+   x > 2: Calculate the result directly using the definition of acosh(x).
+   Greatest observed error in this region is 1.33 ULP:
+   acosh(0x1.1e45d14bfcfa2p+1) got 0x1.71a06f50c34b5p+0
+			      want 0x1.71a06f50c34b6p+0.
+
+   0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is
+   undefined. For 1 <= x <= 2, the largest observed error is 2.69 ULP:
+   acosh(0x1.073528248093p+0) got 0x1.e4d9bd20684f3p-3
+			     want 0x1.e4d9bd20684f6p-3.  */
+double
+acosh (double x)
+{
+  uint64_t ix = asuint64 (x);
+
+  if (unlikely (ix >= MinusZero))
+    return __math_invalid (x);
+
+  if (unlikely (ix >= SquareLim))
+    return optr_aor_log_f64 (x) + Ln2;
+
+  if (ix >= Two)
+    return optr_aor_log_f64 (x + sqrt (x * x - 1));
+
+  double xm1 = x - 1;
+  return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1));
+}
+
+PL_SIG (S, D, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (acosh, 2.19)
+PL_TEST_INTERVAL (acosh, 0, 1, 10000)
+PL_TEST_INTERVAL (acosh, 1, 2, 100000)
+PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
+PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
+PL_TEST_INTERVAL (acosh, -0, -inf, 10000)
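
For 1 <= x <= 2 the routine above rewrites the argument of the logarithm: with xm1 = x - 1, x*x - 1 = xm1*(xm1 + 2), so acosh(x) = log1p(xm1 + sqrt(2*xm1 + xm1*xm1)) and the cancellation in log(x + sqrt(x*x - 1)) near x = 1 never occurs. A minimal check of the identity against libm (compile with -lm):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 1.0000001;
  double m = x - 1;
  /* Both lines should agree to within the documented ULP bound.  */
  printf ("%a\n%a\n", log1p (m + sqrt (2 * m + m * m)), acosh (x));
  return 0;
}
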
diff --git a/pl/math/acoshf_2u8.c b/pl/math/acoshf_2u8.c
new file mode 100644
index 0000000..c9cded7
--- /dev/null
+++ b/pl/math/acoshf_2u8.c
@@ -0,0 +1,63 @@
+/*
+ * Single-precision acosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Ln2 (0x1.62e4p-1f)
+#define MinusZero 0x80000000
+#define SquareLim 0x5f800000 /* asuint(0x1p64).  */
+#define Two 0x40000000
+
+/* Single-precision log from math/.  */
+float
+optr_aor_log_f32 (float);
+
+/* Single-precision log(1+x) from pl/math.  */
+float
+log1pf (float);
+
+/* acoshf approximation using a variety of approaches on different intervals:
+
+   x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
+   close enough to x that we can calculate the result by ln(2x) == ln(x) +
+   ln(2). The greatest error in the region is 0.94 ULP:
+   acoshf(0x1.15f706p+92) got 0x1.022e14p+6 want 0x1.022e16p+6.
+
+   x > 2: Calculate the result directly using the definition of acosh(x) =
+   ln(x + sqrt(x*x - 1)). Greatest error in this region is 1.30 ULP:
+   acoshf(0x1.249d8p+1) got 0x1.77e1aep+0 want 0x1.77e1bp+0.
+
+   0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is
+   undefined. For 1 <= x <= 2, the greatest error is 2.78 ULP:
+   acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3.  */
+float
+acoshf (float x)
+{
+  uint32_t ix = asuint (x);
+
+  if (unlikely (ix >= MinusZero))
+    return __math_invalidf (x);
+
+  if (unlikely (ix >= SquareLim))
+    return optr_aor_log_f32 (x) + Ln2;
+
+  if (ix > Two)
+    return optr_aor_log_f32 (x + sqrtf (x * x - 1));
+
+  float xm1 = x - 1;
+  return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
+}
+
+PL_SIG (S, F, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (acoshf, 2.30)
+PL_TEST_INTERVAL (acoshf, 0, 1, 100)
+PL_TEST_INTERVAL (acoshf, 1, 2, 10000)
+PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
+PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
+PL_TEST_INTERVAL (acoshf, -0, -inf, 10000)
diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c
new file mode 100644
index 0000000..f167955
--- /dev/null
+++ b/pl/math/asinh_2u5.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision asinh(x) function
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "estrin.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26).  */
+#define One 0x3ff0000000000000	  /* asuint64(1.0).  */
+#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511).  */
+#define Ln2 0x1.62e42fefa39efp-1
+
+double
+optr_aor_log_f64 (double);
+
+/* Scalar double-precision asinh implementation. This routine uses different
+   approaches on different intervals:
+
+   |x| < 2^-26: Return x. Function is exact in this region.
+
+   |x| < 1: Use custom order-17 polynomial. This is least accurate close to 1.
+     The largest observed error in this region is 1.47 ULPs:
+     asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+				want 0x1.c1d6bf874019cp-1.
+
+   |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate
+     the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)).
+     The largest observed error in this region is 2.03 ULPs:
+     asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1
+				 want -0x1.c3508eb6a682p-1.
+
+   |x| >= 2^511: We cannot square x without overflow at a low
+     cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot
+     even double x without overflow, so calculate this as ln(x) +
+     ln(2). The largest observed error in this region is 0.98 ULPs at many
+     values, for instance:
+     asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9
+				  want 0x1.52652f4cb26ccp+9.  */
+double
+asinh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  double ax = asdouble (ia);
+  uint64_t sign = ix & ~AbsMask;
+
+  if (ia < ExpM26)
+    {
+      return x;
+    }
+
+  if (ia < One)
+    {
+      double x2 = x * x;
+      double z2 = x2 * x2;
+      double z4 = z2 * z2;
+      double z8 = z4 * z4;
+#define C(i) __asinh_data.poly[i]
+      double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
+      double y = fma (p, x2 * ax, ax);
+      return asdouble (asuint64 (y) | sign);
+    }
+
+  if (unlikely (ia >= Exp511))
+    {
+      return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign);
+    }
+
+  return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1)))
+		   | sign);
+}
+
+PL_SIG (S, D, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (asinh, 1.54)
+PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000)
+PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000)
+PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000)
+PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000)
+PL_TEST_INTERVAL (asinh, 100.0, inf, 50000)
+PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000)
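
asinh is odd, so every branch above computes on ax = |x| and ORs the saved sign bit back into the result's bit pattern. A scalar sketch of that final step:

#include <stdint.h>
#include <string.h>

/* y was computed from |x|, so its sign bit is clear; OR-ing the saved sign
   bit in implements asinh(-x) = -asinh(x) without a branch.  */
static double
restore_sign (double y, uint64_t sign)
{
  uint64_t iy;
  memcpy (&iy, &y, sizeof iy);
  iy |= sign;
  double r;
  memcpy (&r, &iy, sizeof r);
  return r;
}
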
diff --git a/pl/math/asinh_data.c b/pl/math/asinh_data.c
new file mode 100644
index 0000000..073b197
--- /dev/null
+++ b/pl/math/asinh_data.c
@@ -0,0 +1,22 @@
+/*
+ * Double-precision polynomial coefficients for scalar asinh(x)
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* asinh(x) is odd, and the first term of the Taylor expansion is x, so we can
+   approximate the function by x + x^3 * P(x^2), where P(z) has the form:
+   C0 + C1 * z + C2 * z^2 + C3 * z^3 + ...
+   Note P is evaluated on even powers of x only. See tools/asinh.sollya for the
+   algorithm used to generate these coefficients.  */
+const struct asinh_data __asinh_data
+  = {.poly
+     = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
+	0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
+	-0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
+	0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
+	-0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
+	0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}};
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
new file mode 100644
index 0000000..2b2c55d
--- /dev/null
+++ b/pl/math/asinhf_3u5.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "estrinf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffff)
+#define SqrtFltMax (0x1.749e96p+10f)
+#define Ln2 (0x1.62e4p-1f)
+#define One (0x3f8)
+#define ExpM12 (0x398)
+
+#define C(i) __asinhf_data.coeffs[i]
+
+float
+optr_aor_log_f32 (float);
+
+/* asinhf approximation using a variety of approaches on different intervals:
+
+   |x| < 2^-12: Return x. Function is exactly rounded in this region.
+
+   |x| < 1.0: Use custom order-8 polynomial. The largest observed
+     error in this region is 1.3ulps:
+     asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1.
+
+   |x| <= SqrtFltMax: Calculate the result directly using the
+     definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest
+     observed error in this region is 1.99ulps.
+     asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1.
+
+   |x| > SqrtFltMax: We cannot square x without overflow at a low
+     cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot
+     even double x without overflow, so calculate this as ln(x) +
+     ln(2). The largest observed error in this region is 3.39ulps:
+     asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2.  */
+float
+asinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+  float ax = asfloat (ia);
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (ia12 < ExpM12 || ia == 0x7f800000))
+    return x;
+
+  if (unlikely (ia12 >= 0x7f8))
+    return __math_invalidf (x);
+
+  if (ia12 < One)
+    {
+      float x2 = ax * ax;
+      float p = ESTRIN_7 (ax, x2, x2 * x2, C);
+      float y = fmaf (x2, p, ax);
+      return asfloat (asuint (y) | sign);
+    }
+
+  if (unlikely (ax > SqrtFltMax))
+    {
+      return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign);
+    }
+
+  return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign);
+}
+
+PL_SIG (S, F, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (asinhf, 2.9)
+PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000)
+PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000)
+PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000)
diff --git a/pl/math/asinhf_data.c b/pl/math/asinhf_data.c
new file mode 100644
index 0000000..cd1ef16
--- /dev/null
+++ b/pl/math/asinhf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients for single-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya for
+   details of how these coeffs were generated.  */
+const struct asinhf_data __asinhf_data
+  = {.coeffs
+     = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f,
+	0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}};
diff --git a/pl/math/atan2_2u5.c b/pl/math/atan2_2u5.c
new file mode 100644
index 0000000..c909ac9
--- /dev/null
+++ b/pl/math/atan2_2u5.c
@@ -0,0 +1,159 @@
+/*
+ * Double-precision scalar atan2(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <stdbool.h>
+
+#include "atan_common.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Pi (0x1.921fb54442d18p+1)
+#define PiOver2 (0x1.921fb54442d18p+0)
+#define PiOver4 (0x1.921fb54442d18p-1)
+#define SignMask (0x8000000000000000)
+#define ExpMask (0x7ff0000000000000)
+
+/* We calculate atan2 by P(n/d), where n and d are similar to the input
+   arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8,
+   which may underflow if n and d have very different magnitude.
+   POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n
+   and d for which P underflows, and is used to special-case such inputs.  */
+#define POW8_EXP_UFLOW_BOUND 62
+
+static inline int64_t
+biased_exponent (double f)
+{
+  uint64_t fi = asuint64 (f);
+  return (fi & ExpMask) >> 52;
+}
+
+/* Fast implementation of scalar atan2. Largest errors are when y and x are
+   close together. The greatest observed error is 2.28 ULP:
+   atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
+   got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1.  */
+double
+atan2 (double y, double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iy = asuint64 (y);
+
+  uint64_t sign_x = ix & SignMask;
+  uint64_t sign_y = iy & SignMask;
+
+  uint64_t iax = ix & ~SignMask;
+  uint64_t iay = iy & ~SignMask;
+
+  bool xisnan = isnan (x);
+  if (unlikely (isnan (y) && !xisnan))
+    return __math_invalid (y);
+  if (unlikely (xisnan))
+    return __math_invalid (x);
+
+  /* m = 2 * sign(x) + sign(y).  */
+  uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2);
+
+  int64_t exp_diff = biased_exponent (x) - biased_exponent (y);
+
+  /* y = 0.  */
+  if (iay == 0)
+    {
+      switch (m)
+	{
+	case 0:
+	case 1:
+	  return y; /* atan(+-0,+anything)=+-0.  */
+	case 2:
+	  return Pi; /* atan(+0,-anything) = pi.  */
+	case 3:
+	  return -Pi; /* atan(-0,-anything) =-pi.  */
+	}
+    }
+  /* Special case for (x, y) either on or very close to the y axis. Either x =
+     0, or y is much larger than x (difference in exponents >=
+     POW8_EXP_UFLOW_BOUND).  */
+  if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND))
+    return sign_y ? -PiOver2 : PiOver2;
+
+  /* Special case for when x is INF, or (x, y) is very close to the x axis
+     and x is negative.  */
+  if (unlikely (iax == 0x7ff0000000000000
+		|| (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2)))
+    {
+      if (iay == 0x7ff0000000000000)
+	{
+	  switch (m)
+	    {
+	    case 0:
+	      return PiOver4; /* atan(+INF,+INF).  */
+	    case 1:
+	      return -PiOver4; /* atan(-INF,+INF).  */
+	    case 2:
+	      return 3.0 * PiOver4; /* atan(+INF,-INF).  */
+	    case 3:
+	      return -3.0 * PiOver4; /* atan(-INF,-INF).  */
+	    }
+	}
+      else
+	{
+	  switch (m)
+	    {
+	    case 0:
+	      return 0.0; /* atan(+...,+INF).  */
+	    case 1:
+	      return -0.0; /* atan(-...,+INF).  */
+	    case 2:
+	      return Pi; /* atan(+...,-INF).  */
+	    case 3:
+	      return -Pi; /* atan(-...,-INF).  */
+	    }
+	}
+    }
+  /* y is INF.  */
+  if (iay == 0x7ff0000000000000)
+    return sign_y ? -PiOver2 : PiOver2;
+
+  uint64_t sign_xy = sign_x ^ sign_y;
+
+  double ax = asdouble (iax);
+  double ay = asdouble (iay);
+  uint64_t pred_aygtax = (ay > ax);
+
+  /* Set up z for call to atan.  */
+  double n = pred_aygtax ? -ax : ay;
+  double d = pred_aygtax ? ay : ax;
+  double z = n / d;
+
+  double ret;
+  if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND))
+    {
+      /* If (x, y) is very close to x axis and x is positive, the polynomial
+	 will underflow and evaluate to z.  */
+      ret = z;
+    }
+  else
+    {
+      /* Work out the correct shift.  */
+      double shift = sign_x ? -2.0 : 0.0;
+      shift = pred_aygtax ? shift + 1.0 : shift;
+      shift *= PiOver2;
+
+      ret = eval_poly (z, z, shift);
+    }
+
+  /* Account for the sign of x and y.  */
+  return asdouble (asuint64 (ret) ^ sign_xy);
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (S, D, 2, atan2)
+PL_TEST_ULP (atan2, 1.78)
+PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000)
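
The quadrant dispatch above keys off m = 2*sign(x) + sign(y), assembled directly from the IEEE sign bits. A standalone sketch of the encoding:

#include <stdint.h>
#include <string.h>

/* 0: x >= 0, y >= 0;  1: x >= 0, y < 0;
   2: x <  0, y >= 0;  3: x <  0, y < 0  (signed zeros included).  */
static uint32_t
quadrant (double y, double x)
{
  uint64_t ix, iy;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);
  return ((iy >> 63) & 1) | ((ix >> 62) & 2);
}
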
diff --git a/pl/math/atan2f_3u.c b/pl/math/atan2f_3u.c
new file mode 100644
index 0000000..38e1df5
--- /dev/null
+++ b/pl/math/atan2f_3u.c
@@ -0,0 +1,167 @@
+/*
+ * Single-precision scalar atan2(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <stdbool.h>
+
+#include "atanf_common.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Pi (0x1.921fb6p+1f)
+#define PiOver2 (0x1.921fb6p+0f)
+#define PiOver4 (0x1.921fb6p-1f)
+#define SignMask (0x80000000)
+
+/* We calculate atan2f by P(n/d), where n and d are similar to the input
+   arguments, and P is a polynomial. The polynomial may underflow.
+   POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d
+   for which P underflows, and is used to special-case such inputs.  */
+#define POLY_UFLOW_BOUND 24
+
+static inline int32_t
+biased_exponent (float f)
+{
+  uint32_t fi = asuint (f);
+  int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23);
+  if (unlikely (ex == 0))
+    {
+      /* Subnormal case - we still need to get the exponent right for subnormal
+	 numbers as division may take us back inside the normal range.  */
+      return ex - __builtin_clz (fi << 9);
+    }
+  return ex;
+}
+
+/* Fast implementation of scalar atan2f. Largest observed error is
+   2.88ulps in [99.0, 101.0] x [99.0, 101.0]:
+   atan2f(0x1.9332d8p+6, 0x1.8cb6c4p+6) got 0x1.964646p-1
+				       want 0x1.964640p-1.  */
+float
+atan2f (float y, float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iy = asuint (y);
+
+  uint32_t sign_x = ix & SignMask;
+  uint32_t sign_y = iy & SignMask;
+
+  uint32_t iax = ix & ~SignMask;
+  uint32_t iay = iy & ~SignMask;
+
+  /* x or y is NaN.  */
+  if ((iax > 0x7f800000) || (iay > 0x7f800000))
+    return x + y;
+
+  /* m = 2 * sign(x) + sign(y).  */
+  uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2);
+
+  /* The following follows glibc ieee754 implementation, except
+     that we do not use +-tiny shifts (non-nearest rounding mode).  */
+
+  int32_t exp_diff = biased_exponent (x) - biased_exponent (y);
+
+  /* Special case for (x, y) either on or very close to the x axis. Either y =
+     0, or y is tiny and x is huge (difference in exponents >=
+     POLY_UFLOW_BOUND). In the second case, we only want to use this special
+     case when x is negative (i.e. quadrants 2 or 3).  */
+  if (unlikely (iay == 0 || (exp_diff >= POLY_UFLOW_BOUND && m >= 2)))
+    {
+      switch (m)
+	{
+	case 0:
+	case 1:
+	  return y; /* atan(+-0,+anything)=+-0.  */
+	case 2:
+	  return Pi; /* atan(+0,-anything) = pi.  */
+	case 3:
+	  return -Pi; /* atan(-0,-anything) =-pi.  */
+	}
+    }
+  /* Special case for (x, y) either on or very close to the y axis. Either x =
+     0, or x is tiny and y is huge (difference in exponents >=
+     POLY_UFLOW_BOUND).  */
+  if (unlikely (iax == 0 || exp_diff <= -POLY_UFLOW_BOUND))
+    return sign_y ? -PiOver2 : PiOver2;
+
+  /* x is INF.  */
+  if (iax == 0x7f800000)
+    {
+      if (iay == 0x7f800000)
+	{
+	  switch (m)
+	    {
+	    case 0:
+	      return PiOver4; /* atan(+INF,+INF).  */
+	    case 1:
+	      return -PiOver4; /* atan(-INF,+INF).  */
+	    case 2:
+	      return 3.0f * PiOver4; /* atan(+INF,-INF).  */
+	    case 3:
+	      return -3.0f * PiOver4; /* atan(-INF,-INF).  */
+	    }
+	}
+      else
+	{
+	  switch (m)
+	    {
+	    case 0:
+	      return 0.0f; /* atan(+...,+INF).  */
+	    case 1:
+	      return -0.0f; /* atan(-...,+INF).  */
+	    case 2:
+	      return Pi; /* atan(+...,-INF).  */
+	    case 3:
+	      return -Pi; /* atan(-...,-INF).  */
+	    }
+	}
+    }
+  /* y is INF.  */
+  if (iay == 0x7f800000)
+    return sign_y ? -PiOver2 : PiOver2;
+
+  uint32_t sign_xy = sign_x ^ sign_y;
+
+  float ax = asfloat (iax);
+  float ay = asfloat (iay);
+
+  bool pred_aygtax = (ay > ax);
+
+  /* Set up z for call to atanf.  */
+  float n = pred_aygtax ? -ax : ay;
+  float d = pred_aygtax ? ay : ax;
+  float z = n / d;
+
+  float ret;
+  if (unlikely (m < 2 && exp_diff >= POLY_UFLOW_BOUND))
+    {
+      /* If (x, y) is very close to x axis and x is positive, the polynomial
+	 will underflow and evaluate to z.  */
+      ret = z;
+    }
+  else
+    {
+      /* Work out the correct shift.  */
+      float shift = sign_x ? -2.0f : 0.0f;
+      shift = pred_aygtax ? shift + 1.0f : shift;
+      shift *= PiOver2;
+
+      ret = eval_poly (z, z, shift);
+    }
+
+  /* Account for the sign of x and y.  */
+  return asfloat (asuint (ret) ^ sign_xy);
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (S, F, 2, atan2)
+PL_TEST_ULP (atan2f, 2.4)
+PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000)
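
biased_exponent above extends the exponent linearly below the normal range: fi << 9 shifts out the sign and exponent so the fraction is top-aligned, and the leading-zero count then says how many binades the value sits below 0x1p-127f, carrying the biased exponent past zero. A worked check, assuming IEEE binary32 and a nonzero subnormal (so __builtin_clz has a defined argument):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int
main (void)
{
  float x = 0x1p-130f;   /* Subnormal: stored exponent is 0.  */
  uint32_t fi;
  memcpy (&fi, &x, sizeof fi);
  int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23);
  printf ("%d\n", ex - __builtin_clz (fi << 9));   /* Prints -3.  */
  return 0;
}
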
diff --git a/pl/math/atan_2u5.c b/pl/math/atan_2u5.c
new file mode 100644
index 0000000..ee47701
--- /dev/null
+++ b/pl/math/atan_2u5.c
@@ -0,0 +1,73 @@
+/*
+ * Double-precision atan(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "atan_common.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define PiOver2 0x1.921fb54442d18p+0
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
+#define OneTop 0x3ff
+
+/* Fast implementation of double-precision atan.
+   Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=-1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+   atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+			     want 0x1.9225645bdd7c3p-1.  */
+double
+atan (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t sign = ix & ~AbsMask;
+  uint64_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 52;
+
+  if (unlikely (ia12 >= BigBound || ia12 < TinyBound))
+    {
+      if (ia12 < TinyBound)
+	/* Avoid underflow by returning x.  */
+	return x;
+      if (ia > 0x7ff0000000000000)
+	/* Propagate NaN.  */
+	return __math_invalid (x);
+      /* atan(x) rounds to PiOver2 for large x.  */
+      return asdouble (asuint64 (PiOver2) ^ sign);
+    }
+
+  double z, az, shift;
+  if (ia12 >= OneTop)
+    {
+      /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x).  */
+      z = -1.0 / x;
+      shift = PiOver2;
+      /* Use absolute value only when needed (odd powers of z).  */
+      az = -fabs (z);
+    }
+  else
+    {
+      /* For x < 1, approximate atan(x) directly.  */
+      z = x;
+      shift = 0;
+      az = asdouble (ia);
+    }
+
+  /* Calculate polynomial, shift + z + z^3 * P(z^2).  */
+  double y = eval_poly (z, az, shift);
+  /* Copy sign.  */
+  return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (atan, 1.78)
+PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000)
diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h
new file mode 100644
index 0000000..da0da64
--- /dev/null
+++ b/pl/math/atan_common.h
@@ -0,0 +1,49 @@
+/*
+ * Double-precision polynomial evaluation function for scalar and vector atan(x)
+ * and atan2(y,x).
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "estrin.h"
+
+#if V_SUPPORTED
+
+#include "v_math.h"
+
+#define DBL_T v_f64_t
+#define P(i) v_f64 (__atan_poly_data.poly[i])
+
+#else
+
+#define DBL_T double
+#define P(i) __atan_poly_data.poly[i]
+
+#endif
+
+/* Polynomial used in fast atan(x) and atan2(y,x) implementations.
+   The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+static inline DBL_T
+eval_poly (DBL_T z, DBL_T az, DBL_T shift)
+{
+  /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+     full scheme to avoid underflow in x^16.  */
+  DBL_T z2 = z * z;
+  DBL_T x2 = z2 * z2;
+  DBL_T x4 = x2 * x2;
+  DBL_T x8 = x4 * x4;
+  DBL_T y
+    = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
+
+  /* Finalize. y = shift + z + z^3 * P(z^2).  */
+  y = FMA (y, z2 * az, az);
+  y = y + shift;
+
+  return y;
+}
+
+#undef DBL_T
+#undef FMA
+#undef P
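
The split in eval_poly keeps the highest power out of the inner Estrin trees: the degree-19 polynomial is evaluated as lo + z^16 * hi, so the tiny z^16 term enters through exactly one fma. A minimal order-7 analogue of the same shape, with hypothetical coefficients c[0..7]:

#include <math.h>

/* P(w) = lo(w) + w^4 * hi(w); each half is an order-3 Estrin tree, and w^4
   is multiplied in once at the top instead of deep inside the tree where a
   denormal intermediate could raise spurious underflow.  */
static double
estrin7_split (double w, const double c[8])
{
  double w2 = w * w;
  double lo = fma (w2, fma (w, c[3], c[2]), fma (w, c[1], c[0]));
  double hi = fma (w2, fma (w, c[7], c[6]), fma (w, c[5], c[4]));
  return fma (hi, w2 * w2, lo);
}
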
diff --git a/pl/math/atan_data.c b/pl/math/atan_data.c
new file mode 100644
index 0000000..91d0f61
--- /dev/null
+++ b/pl/math/atan_data.c
@@ -0,0 +1,20 @@
+/*
+ * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct atan_poly_data __atan_poly_data = {
+  .poly = {/* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
+	      [2**-1022, 1.0]. See atan.sollya for details of how these were
+	      generated.  */
+	   -0x1.5555555555555p-2,  0x1.99999999996c1p-3,  -0x1.2492492478f88p-3,
+	   0x1.c71c71bc3951cp-4,   -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+	   -0x1.11100ee084227p-4,  0x1.e1d0f9696f63bp-5,  -0x1.aebfe7b418581p-5,
+	   0x1.842dbe9b0d916p-5,   -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+	   -0x1.00e6eece7de8p-5,   0x1.860897b29e5efp-6,  -0x1.0051381722a59p-6,
+	   0x1.14e9dc19a4a4ep-7,   -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+	   -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}};
diff --git a/pl/math/atanf_2u9.c b/pl/math/atanf_2u9.c
new file mode 100644
index 0000000..9d17f25
--- /dev/null
+++ b/pl/math/atanf_2u9.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision atan(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "atanf_common.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define PiOver2 0x1.921fb6p+0f
+#define AbsMask 0x7fffffff
+#define TinyBound 0x30800000 /* asuint(0x1p-30).  */
+#define BigBound 0x4e800000  /* asuint(0x1p30).  */
+#define One 0x3f800000
+
+/* Approximation of single-precision atan(x) based on
+   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+   using z=-1/x and shift = pi/2.
+   Maximum error is 2.88 ulps:
+   atanf(0x1.0565ccp+0) got 0x1.97771p-1
+		       want 0x1.97770ap-1.  */
+float
+atanf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix & ~AbsMask;
+  uint32_t ia = ix & AbsMask;
+
+  if (unlikely (ia < TinyBound))
+    /* Avoid underflow by returning x.  */
+    return x;
+
+  if (unlikely (ia > BigBound))
+    {
+      if (ia > 0x7f800000)
+	/* Propagate NaN.  */
+	return __math_invalidf (x);
+      /* atan(x) rounds to PiOver2 for large x.  */
+      return asfloat (asuint (PiOver2) ^ sign);
+    }
+
+  float z, az, shift;
+  if (ia > One)
+    {
+      /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x).  */
+      z = -1.0f / x;
+      shift = PiOver2;
+      /* Use absolute value only when needed (odd powers of z).  */
+      az = -fabsf (z);
+    }
+  else
+    {
+      /* For x < 1, approximate atan(x) directly.  */
+      z = x;
+      az = asfloat (ia);
+      shift = 0;
+    }
+
+  /* Calculate polynomial, shift + z + z^3 * P(z^2).  */
+  float y = eval_poly (z, az, shift);
+  /* Copy sign.  */
+  return asfloat (asuint (y) ^ sign);
+}
+
+PL_SIG (S, F, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (atanf, 2.38)
+PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000)
+PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000)
+PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000)
+PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000)
+PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000)
+PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000)
+PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000)
+PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000)
diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h
new file mode 100644
index 0000000..37ca76d
--- /dev/null
+++ b/pl/math/atanf_common.h
@@ -0,0 +1,51 @@
+/*
+ * Single-precision polynomial evaluation function for scalar and vector
+ * atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_ATANF_COMMON_H
+#define PL_MATH_ATANF_COMMON_H
+
+#include "math_config.h"
+#include "estrinf.h"
+
+#if V_SUPPORTED
+
+#include "v_math.h"
+
+#define FLT_T v_f32_t
+#define P(i) v_f32 (__atanf_poly_data.poly[i])
+
+#else
+
+#define FLT_T float
+#define P(i) __atanf_poly_data.poly[i]
+
+#endif
+
+/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations.
+   The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+static inline FLT_T
+eval_poly (FLT_T z, FLT_T az, FLT_T shift)
+{
+  /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+     a standard implementation using z8 creates spurious underflow
+     in the very last fma (when z^8 is small enough).
+     Therefore, we split the last fma into a mul and an fma.
+     Horner and single-level Estrin have higher errors that exceed the
+     threshold.  */
+  FLT_T z2 = z * z;
+  FLT_T z4 = z2 * z2;
+
+  /* Then assemble polynomial.  */
+  FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
+
+  /* Finalize:
+     y = shift + z + z^3 * P(z^2).  */
+  return FMA (y, z2 * az, az) + shift;
+}
+
+#endif // PL_MATH_ATANF_COMMON_H
diff --git a/pl/math/atanf_data.c b/pl/math/atanf_data.c
new file mode 100644
index 0000000..c4cba23
--- /dev/null
+++ b/pl/math/atanf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on [2**-128, 1.0].
+ */
+const struct atanf_poly_data __atanf_poly_data = {
+  .poly = {/* See atanf.sollya for details of how these were generated.  */
+	   -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+	   -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}};
diff --git a/pl/math/atanh_3u.c b/pl/math/atanh_3u.c
new file mode 100644
index 0000000..a168cd5
--- /dev/null
+++ b/pl/math/atanh_3u.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision atanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+#define Ln2Hi 0x1.62e42fefa3800p-1
+#define Ln2Lo 0x1.ef35793c76730p-45
+#define OneMHfRt2Top                                                           \
+  0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)).  */
+#define OneTop12 0x3ff
+#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)).  */
+#define BottomMask 0xffffffff
+#define C(i) __log1p_data.coeffs[i]
+
+static inline double
+log1p_inline (double x)
+{
+  /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced
+     interval. Copied from log1p_2u.c, with no special-case handling. See that
+     file for details of the algorithm.  */
+  double m = x + 1;
+  uint64_t mi = asuint64 (m);
+
+  /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in
+     [sqrt(2)/2, sqrt(2)].  */
+  uint32_t u = (mi >> 32) + OneMHfRt2Top;
+  int32_t k = (int32_t) (u >> 20) - OneTop12;
+  uint32_t utop = (u & 0x000fffff) + HfRt2Top;
+  uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask);
+  double f = asdouble (u_red) - 1;
+
+  /* Correction term for round-off in f.  */
+  double cm = (x - (m - 1)) / m;
+
+  /* Approximate log1p(f) with polynomial.  */
+  double f2 = f * f;
+  double f4 = f2 * f2;
+  double f8 = f4 * f4;
+  double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f);
+
+  /* Recombine log1p(x) = k*log2 + log1p(f) + c/m.  */
+  double kd = k;
+  double y = fma (Ln2Lo, kd, cm);
+  return y + fma (Ln2Hi, kd, p);
+}
+
+/* Approximation for double-precision inverse tanh(x), using a simplified
+   version of log1p. Greatest observed error is 3.00 ULP:
+   atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4
+			      want 0x1.e7da77672a64ap-4.  */
+double
+atanh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t sign = ix & ~AbsMask;
+  uint64_t ia = ix & AbsMask;
+
+  if (unlikely (ia == One))
+    return __math_divzero (sign >> 32);
+
+  if (unlikely (ia > One))
+    return __math_invalid (x);
+
+  double halfsign = asdouble (Half | sign);
+  double ax = asdouble (ia);
+  return halfsign * log1p_inline ((2 * ax) / (1 - ax));
+}
+
+PL_SIG (S, D, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (atanh, 3.00)
+PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000)
+PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000)
+PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000)
+PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000)
+PL_TEST_INTERVAL (atanh, 1, inf, 100)
+PL_TEST_INTERVAL (atanh, -1, -inf, 100)
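
The reduction above rests on (1 + a) / (1 - a) = 1 + 2a/(1 - a), so atanh(x) = 0.5 * log1p(2|x|/(1 - |x|)), with halfsign carrying both the 0.5 factor and the sign of x. A minimal check against libm (compile with -lm):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = -0.3;
  double ax = fabs (x);
  double halfsign = copysign (0.5, x);
  /* Both lines should agree to within the documented ULP bound.  */
  printf ("%a\n%a\n", halfsign * log1p ((2 * ax) / (1 - ax)), atanh (x));
  return 0;
}
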
diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c
new file mode 100644
index 0000000..fb90aa2
--- /dev/null
+++ b/pl/math/atanhf_3u1.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision atanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Four 0x40800000
+#define Ln2 0x1.62e43p-1f
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+#define C(i) __log1pf_data.coeffs[i]
+
+static inline float
+eval_poly (float m)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  float p_12 = fmaf (m, C (1), C (0));
+  float p_34 = fmaf (m, C (3), C (2));
+  float p_56 = fmaf (m, C (5), C (4));
+  float p_78 = fmaf (m, C (7), C (6));
+
+  float m2 = m * m;
+  float p_02 = fmaf (m2, p_12, m);
+  float p_36 = fmaf (m2, p_56, p_34);
+  float p_79 = fmaf (m2, C (8), p_78);
+
+  float m4 = m2 * m2;
+  float p_06 = fmaf (m4, p_36, p_02);
+
+  return fmaf (m4 * p_79, m4, p_06);
+}
+
+static inline float
+log1pf_inline (float x)
+{
+  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+     special-case handling. See that file for details of the algorithm.  */
+  float m = x + 1.0f;
+  int k = (asuint (m) - 0x3f400000) & 0xff800000;
+  float s = asfloat (Four - k);
+  float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f);
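+  /* k equals n << 23 for the integer n such that m / 2^n lies in [0.75, 1.5);
+     the bit-level subtractions divide x and 4.0 by 2^n, and 0.25 * s - 1 is
+     exactly 2^-n - 1, so m_scale ~= (x + 1) / 2^n - 1, which lies in
+     [-0.25, 0.5].  */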
+  float p = eval_poly (m_scale);
+  float scale_back = (float) k * 0x1.0p-23f;
+  return fmaf (scale_back, Ln2, p);
+}
+
+/* Approximation for single-precision inverse tanh(x), using a simplified
+   version of log1p. Maximum error is 3.08 ULP:
+   atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5
+		       want 0x1.ffb76ep-5.  */
+float
+atanhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax < TinyBound))
+    return x;
+
+  if (iax == One)
+    return __math_divzero (sign);
+
+  if (unlikely (iax > One))
+    return __math_invalidf (x);
+
+  float halfsign = asfloat (Half | sign);
+  float ax = asfloat (iax);
+  return halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+}
+
+PL_SIG (S, F, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (atanhf, 2.59)
+PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500)
+PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000)
+PL_TEST_INTERVAL (atanhf, 1, inf, 1000)
+PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500)
+PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000)
+PL_TEST_INTERVAL (atanhf, -1, -inf, 1000)
diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c
new file mode 100644
index 0000000..83715dd
--- /dev/null
+++ b/pl/math/cbrt_2u.c
@@ -0,0 +1,70 @@
+/*
+ * Double-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+PL_SIG (S, D, 1, cbrt, -10.0, 10.0)
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds 0x1.5555555555555p-1
+
+#define C(i) __cbrt_data.poly[i]
+#define T(i) __cbrt_data.table[i]
+
+/* Approximation for double-precision cbrt(x), using low-order polynomial and
+   two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+   according to the exponent, for instance an error observed for double value
+   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+   integer.
+   cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+			     want 0x1.965fe72821e99p+0.  */
+double
+cbrt (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iax = ix & AbsMask;
+  uint64_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax == 0 || iax == 0x7ff0000000000000))
+    return x;
+
+  /* |x| = m * 2^e, where m is in [0.5, 1.0].
+     We can easily decompose x into m and e using frexp.  */
+  int e;
+  double m = frexp (asdouble (iax), &e);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+     Newton iterations.  */
+  double p_01 = fma (C (1), m, C (0));
+  double p_23 = fma (C (3), m, C (2));
+  double p = fma (p_23, m * m, p_01);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
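+  /* Each step is Newton's method applied to f(a) = a^3 - m:
+     a' = a - (a^3 - m) / (3 * a^2) = (2/3) * a + (m/3) / a^2.  */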
+  double m_by_3 = m / 3;
+  double a = fma (TwoThirds, p, m_by_3 / (p * p));
+  a = fma (TwoThirds, a, m_by_3 / (a * a));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+     i is an integer in [-2, 2], so t can be looked up in the table T.
+     Hence the result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign,
+     which can be done easily using ldexp.  */
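+  /* Note: C's % operator truncates towards zero, so for negative e the
+     remainder e % 3 lies in [-2, 0]; the offset of 2 maps the full range
+     [-2, 2] onto the five entries of T.  */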
+  return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign);
+}
+
+PL_TEST_ULP (cbrt, 1.30)
+PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
+PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
diff --git a/pl/math/cbrt_data.c b/pl/math/cbrt_data.c
new file mode 100644
index 0000000..3d484c2
--- /dev/null
+++ b/pl/math/cbrt_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for double-precision cbrt(x).
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrt_data __cbrt_data
+  = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+                  See cbrt.sollya for details of generation.  */
+	      0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3},
+     .table = { /* table[i] = 2^((i - 2) / 3).  */
+	         0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}};
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
new file mode 100644
index 0000000..adc5917
--- /dev/null
+++ b/pl/math/cbrtf_1u5.c
@@ -0,0 +1,67 @@
+/*
+ * Single-precision cbrt(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "estrinf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffff
+#define SignMask 0x80000000
+#define TwoThirds 0x1.555556p-1f
+
+#define C(i) __cbrtf_data.poly[i]
+#define T(i) __cbrtf_data.table[i]
+
+/* Approximation for single-precision cbrt(x), using low-order polynomial and
+   one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This
+   is observed for every value where the mantissa is 0x1.81410e and the exponent
+   is a multiple of 3, for example:
+   cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+			want 0x1.255d92p+10.  */
+float
+cbrtf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & SignMask;
+
+  if (unlikely (iax == 0 || iax == 0x7f800000))
+    return x;
+
+  /* |x| = m * 2^e, where m is in [0.5, 1.0].
+     We can easily decompose x into m and e using frexpf.  */
+  int e;
+  float m = frexpf (asfloat (iax), &e);
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+     the less accurate the next stage of the algorithm needs to be. An order-4
+     polynomial is enough for one Newton iteration.  */
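+  /* Newton's method converges quadratically, roughly doubling the number of
+     correct bits per step, so a starting point accurate to about half of the
+     24 significand bits is sufficient for a single iteration.  */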
+  float p = ESTRIN_3 (m, m * m, C);
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  float m_by_3 = m / 3;
+  float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)).
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3.
+     i is an integer in [-2, 2], so t can be looked up in the table T.
+     Hence the result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign,
+     which can be done easily using ldexpf.  */
+  return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign);
+}
+
+PL_SIG (S, F, 1, cbrt, -10.0, 10.0)
+PL_TEST_ULP (cbrtf, 1.03)
+PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000)
+PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000)
diff --git a/pl/math/cbrtf_data.c b/pl/math/cbrtf_data.c
new file mode 100644
index 0000000..c6cdb4d
--- /dev/null
+++ b/pl/math/cbrtf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and table entries for single-precision cbrt(x).
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct cbrtf_data __cbrtf_data
+  = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1].
+                  See cbrtf.sollya for details of generation.  */
+	        0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3},
+     .table = { /* table[i] = 2^((i - 2) / 3).  */
+	        0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}};
diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c
new file mode 100644
index 0000000..5d1df07
--- /dev/null
+++ b/pl/math/cosh_2u.c
@@ -0,0 +1,66 @@
+/*
+ * Double-precision cosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define SpecialBound                                                           \
+  0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows.  */
+
+double
+__exp_dd (double, double);
+
+static double
+specialcase (double x, uint64_t iax)
+{
+  if (iax == 0x7ff0000000000000)
+    return INFINITY;
+  if (iax > 0x7ff0000000000000)
+    return __math_invalid (x);
+  /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by
+     exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2.  */
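+  /* Multiplying by 0.5 before the final squaring keeps the intermediate below
+     DBL_MAX, so finite results near the overflow threshold are not lost to
+     intermediate overflow.  */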
+  double t = __exp_dd (asdouble (iax) / 2, 0);
+  return (0.5 * t) * t;
+}
+
+/* Approximation for double-precision cosh(x).
+   cosh(x) = (exp(x) + exp(-x)) / 2.
+   The greatest observed error is in the special region, 1.93 ULP:
+   cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+			     want 0x1.fdf28623ef923p+1021.
+
+   The greatest observed error in the non-special region is 1.03 ULP:
+   cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0
+			     want 0x1.fe54962842d0fp+0.  */
+double
+cosh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iax = ix & AbsMask;
+
+  /* exp overflows slightly before cosh does, so use the special-case handler
+     for the gap, as well as for special values.  */
+  if (unlikely (iax >= SpecialBound))
+    return specialcase (x, iax);
+
+  double ax = asdouble (iax);
+  /* Use the double-precision exp helper to calculate exp(|x|), then:
+     cosh(x) = exp(|x|) / 2 + 1 / (exp(|x|) * 2).  */
+  double t = __exp_dd (ax, 0);
+  return 0.5 * t + 0.5 / t;
+}
+
+PL_SIG (S, D, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (cosh, 1.43)
+PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
+PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000)
+PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
+PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000)
+PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100)
+PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100)
diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c
new file mode 100644
index 0000000..c125c92
--- /dev/null
+++ b/pl/math/coshf_1u9.c
@@ -0,0 +1,71 @@
+/*
+ * Single-precision cosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffff
+#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this.  */
+#define SpecialBound                                                           \
+  0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use       \
+		special case.  */
+
+float
+optr_aor_exp_f32 (float);
+
+static NOINLINE float
+specialcase (float x, uint32_t iax)
+{
+  if (iax == 0x7f800000)
+    return INFINITY;
+  if (iax > 0x7f800000)
+    return __math_invalidf (x);
+  if (iax <= TinyBound)
+    /* For tiny x, avoid underflow by just returning 1.  */
+    return 1;
+  /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x)
+     without overflow, so use exp(|x|/2) instead. For large x cosh(x) is
+     dominated by exp(x), so return:
+     cosh(x) ~= (exp(|x|/2))^2 / 2.  */
+  float t = optr_aor_exp_f32 (asfloat (iax) / 2);
+  return (0.5 * t) * t;
+}
+
+/* Approximation for single-precision cosh(x) using exp.
+   cosh(x) = (exp(x) + exp(-x)) / 2.
+   The maximum error is 1.89 ULP, observed for |x| > SpecialBound:
+   coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127.
+   The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP:
+   coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0.  */
+float
+coshf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  float ax = asfloat (iax);
+
+  if (unlikely (iax <= TinyBound || iax >= SpecialBound))
+    {
+      /* x is tiny, large or special.  */
+      return specialcase (x, iax);
+    }
+
+  /* Compute cosh using the definition:
+     coshf(x) = exp(x) / 2 + exp(-x) / 2.  */
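+  /* With t = exp(|x|), exp(-|x|) = 1 / t, so one call to the exp helper
+     suffices; cosh is even, so evaluating at |x| loses nothing.  */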
+  float t = optr_aor_exp_f32 (ax);
+  return 0.5f * t + 0.5f / t;
+}
+
+PL_SIG (S, F, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (coshf, 1.89)
+PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100)
+PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
+PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
+PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100)
+PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000)
+PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000)
diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c
new file mode 100644
index 0000000..e9af9d3
--- /dev/null
+++ b/pl/math/erfc_4u5.c
@@ -0,0 +1,155 @@
+/*
+ * Double-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pairwise_horner.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffffffffffff)
+
+#define xint __erfc_data.interval_bounds
+#define PX __erfc_data.poly
+
+/* Accurate exponential from optimized routines.  */
+double
+__exp_dd (double x, double xtail);
+
+static inline double
+eval_poly_horner (double z, int i)
+{
+  double z2 = z * z;
+#define C(j) PX[i][j]
+  return PAIRWISE_HORNER_12 (z, z2, C);
+}
+
+/* Accurate evaluation of exp(-x^2)
+   using a compensated product (x^2 ~ x*x + e2)
+   and the __exp_dd(y,d) routine, i.e. the
+   computation of exp(y+d) with a small correction d << y.  */
+static inline double
+eval_accurate_gaussian (double a)
+{
+  double e2;
+  double a2 = a * a;
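+  /* Split a into high and low halves using Veltkamp splitting; the constant
+     0x1.0000002p27 is 2^27 + 1.  The FMAs below then accumulate
+     e2 = a2 - a * a exactly, so that exp(-a * a) = exp(-a2 + e2).  */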
+  double aa1 = -fma (0x1.0000002p27, a, -a);
+  aa1 = fma (0x1.0000002p27, a, aa1);
+  double aa2 = a - aa1;
+  e2 = fma (-aa1, aa1, a2);
+  e2 = fma (-aa1, aa2, e2);
+  e2 = fma (-aa2, aa1, e2);
+  e2 = fma (-aa2, aa2, e2);
+  return __exp_dd (-a2, e2);
+}
+
+/* Approximation of erfc for |x| > 6.0.  */
+static inline double
+approx_erfc_hi (double x, int i)
+{
+  double a = fabs (x);
+  double z = a - xint[i];
+  double p = eval_poly_horner (z, i);
+  double e_mx2 = eval_accurate_gaussian (a);
+  return p * e_mx2;
+}
+
+static inline int
+get_itv_idx (double x)
+{
+  /* Interval bounds follow a logarithmic scale, i.e. interval n has
+     lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
+     the interval index: if 2^(n/4) - 1 <= |x| < 2^((n+1)/4) - 1, then
+     (|x|+1)^4 lies in [2^n, 2^(n+1)), so its unbiased exponent is n.  */
+  double a = asdouble (asuint64 (x) & AbsMask);
+  double z = a + 1.0;
+  z = z * z;
+  z = z * z;
+  return (asuint64 (z) >> 52) - 1023;
+}
+
+/* Approximation of erfc for |x| < 6.0.  */
+static inline double
+approx_erfc_lo (double x, uint32_t sign, int i)
+{
+  double a = fabs (x);
+  double z = a - xint[i];
+  double p = eval_poly_horner (z, i);
+  double e_mx2 = eval_accurate_gaussian (a);
+  if (sign)
+    return fma (-p, e_mx2, 2.0);
+  else
+    return p * e_mx2;
+}
+
+/* Top 12 bits of a double with the sign bit cleared (the biased exponent).  */
+static inline uint32_t
+abstop12 (double x)
+{
+  return (asuint64 (x) >> 52) & 0x7ff;
+}
+
+/* Top 32 bits of a double.  */
+static inline uint32_t
+top32 (double x)
+{
+  return asuint64 (x) >> 32;
+}
+
+/* Fast erfc implementation.
+   The approximation uses fixed-order polynomial approximations of
+   exp(x^2) * erfc(x) on 20 intervals.
+   Maximum measured error is 4.05 ULP:
+   erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2
+			     want 0x1.ff84036f8f0b7p-2.  */
+double
+erfc (double x)
+{
+  /* Get top words.  */
+  uint32_t ix = top32 (x); /* We need to compare at most 32 bits.  */
+  uint32_t ia = ix & 0x7fffffff;
+  uint32_t sign = ix >> 31;
+
+  /* Handle special cases and small values with a single comparison:
+     abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
+     Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2
+     Errno EDOM does not have to be set in case of erfc(nan).
+     Only ERANGE may be set in case of underflow.
+     Small values (|x|<small)
+       |x|<0x1.0p-56 => accurate up to 0.5 ULP (top12(0x1p-56) = 0x3c7)
+       |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd).  */
+  if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd))
+    {
+      if (abstop12 (x) >= 0x7ff)
+	return (double) (sign << 1) + 1.0 / x; /* special cases.  */
+      else
+	return 1.0 - x; /* small case.  */
+    }
+  else if (ia < 0x40180000)
+    { /* |x| < 6.0.  */
+      return approx_erfc_lo (x, sign, get_itv_idx (x));
+    }
+  else if (sign)
+    { /* x <= -6.0.  */
+      return 2.0;
+    }
+  else if (ia < 0x403c0000)
+    { /* 6.0 <= x < 28.  */
+      return approx_erfc_hi (x, get_itv_idx (x));
+    }
+  else
+    { /* x > 28.  */
+      return __math_uflow (0);
+    }
+}
+
+PL_SIG (S, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (erfc, 3.56)
+PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000)
+PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000)
+PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (erfc, 0, inf, 40000)
diff --git a/pl/math/erfc_data.c b/pl/math/erfc_data.c
new file mode 100644
index 0000000..fa7184f
--- /dev/null
+++ b/pl/math/erfc_data.c
@@ -0,0 +1,145 @@
+/*
+ * Data used in double-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
+   precision. Generated using the Remez algorithm on each interval separately
+   (see erfc.sollya for more detail).  */
+const struct erfc_data __erfc_data = {
+
+/* Bounds for 20 intervals spanning [0x1.0p-50, 31.0]. Interval bounds follow
+   a logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the
+   exception of the first interval.  */
+.interval_bounds = {
+  0x1.0p-50,		/* Tiny boundary.  */
+  0x1.837f05c490126p-3, /* 0.189.  */
+  0x1.a827997709f7ap-2, /* 0.414.  */
+  0x1.5d13f326fe9c8p-1, /* 0.682.  */
+  0x1.0p0,		/* 1.000.  */
+  0x1.60dfc14636e2ap0,	/* 1.378.  */
+  0x1.d413cccfe779ap0,	/* 1.828.  */
+  0x1.2e89f995ad3adp1,	/* 2.364.  */
+  0x1.8p1,		/* 3.000.  */
+  0x1.e0dfc14636e2ap1,	/* 3.757.  */
+  0x1.2a09e667f3bcdp2,	/* 4.657.  */
+  0x1.6e89f995ad3adp2,	/* 5.727.  */
+  0x1.cp2,		/* 7.000.  */
+  0x1.106fe0a31b715p3,	/* 8.514.  */
+  0x1.4a09e667f3bcdp3,	/* 10.31.  */
+  0x1.8e89f995ad3adp3,	/* 12.45.  */
+  0x1.ep3,		/* 15.00.  */
+  0x1.206fe0a31b715p4,	/* 18.03.  */
+  0x1.5a09e667f3bcdp4,	/* 21.63.  */
+  0x1.9e89f995ad3adp4,	/* 25.91.  */
+  0x1.fp4		/* 31.00.  */
+},
+
+/* Coefficients for each order 12 polynomial on each of the 20 intervals.  */
+.poly = {
+  {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1,
+   -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2,
+   0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5,
+   -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9,
+   0x1.c9bfafa73899cp-11},
+  {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1,
+   -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3,
+   0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6,
+   -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10,
+   0x1.526a8a14e9bfcp-12},
+  {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2,
+   -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4,
+   0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7,
+   -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12,
+   0x1.b451af7dd52fep-14},
+  {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2,
+   -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5,
+   0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9,
+   -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13,
+   0x1.e654e67532b44p-16},
+  {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3,
+   -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6,
+   0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10,
+   -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15,
+   0x1.d213a128a75c9p-18},
+  {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4,
+   -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8,
+   0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12,
+   -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17,
+   0x1.7f90154bde15dp-20},
+  {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5,
+   -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9,
+   0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14,
+   -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20,
+   0x1.1020f4741f79ep-22},
+  {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6,
+   -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11,
+   0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16,
+   -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22,
+   0x1.501716d098f14p-25},
+  {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6,
+   -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12,
+   0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18,
+   -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25,
+   0x1.6eb74e2e99662p-28},
+  {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7,
+   -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14,
+   0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20,
+   -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28,
+   0x1.68510d1c32842p-31},
+  {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8,
+   -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15,
+   0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23,
+   -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30,
+   0x1.45aabbe505f6ap-34},
+  {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9,
+   -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, -0x1.91a8207766e1ep-17,
+   0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25,
+   -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33,
+   0x1.14989aac741c2p-37},
+  {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10,
+   -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18,
+   0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27,
+   -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37,
+   0x1.c21ba1b404f5ap-41},
+  {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11,
+   -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20,
+   0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30,
+   -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40,
+   0x1.6487c50052867p-44},
+  {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11,
+   -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22,
+   0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32,
+   -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43,
+   0x1.165732f1ae138p-47},
+  {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12,
+   -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23,
+   0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34,
+   -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46,
+   0x1.b0241c6d5b761p-51},
+  {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13,
+   -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25,
+   0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37,
+   -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49,
+   0x1.4f8abb4398a0dp-54},
+  {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14,
+   -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26,
+   0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39,
+   -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52,
+   0x1.058cd4ea9bf04p-57},
+  {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15,
+   -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28,
+   0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41,
+   -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55,
+   0x1.9a2af47d77e44p-61},
+  {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15,
+   -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30,
+   0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44,
+   -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58,
+   0x1.43d3358c64dafp-64}
+}
+};
diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h
new file mode 100644
index 0000000..8f1e5f4
--- /dev/null
+++ b/pl/math/erfcf.h
@@ -0,0 +1,38 @@
+/*
+ * Shared functions for scalar and vector single-precision erfc(x) functions.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_ERFCF_H
+#define PL_MATH_ERFCF_H
+
+#include "math_config.h"
+
+#define FMA fma
+#include "estrin_wrap.h"
+
+/* Accurate exponential from optimized-routines.  */
+double
+__exp_dd (double x, double xtail);
+
+static inline double
+eval_poly (double z, const double *coeff)
+{
+  double z2 = z * z;
+  double z4 = z2 * z2;
+  double z8 = z4 * z4;
+#define C(i) coeff[i]
+  return ESTRIN_15 (z, z2, z4, z8, C);
+#undef C
+}
+
+static inline double
+eval_exp_mx2 (double x)
+{
+  return __exp_dd (-(x * x), 0.0);
+}
+
+#undef FMA
+#endif // PL_MATH_ERFCF_H
diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
new file mode 100644
index 0000000..5a3f9b0
--- /dev/null
+++ b/pl/math/erfcf_2u.c
@@ -0,0 +1,133 @@
+/*
+ * Single-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "erfcf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define P(i) __erfcf_poly_data.poly[i]
+
+/* Approximation of erfcf for |x| > 4.0.  */
+static inline float
+approx_erfcf_hi (float x, uint32_t sign, const double *coeff)
+{
+  if (sign)
+    {
+      return 2.0f;
+    }
+
+  /* Polynomial contribution.  */
+  double z = (double) fabs (x);
+  float p = (float) eval_poly (z, coeff);
+  /* Gaussian contribution.  */
+  float e_mx2 = (float) eval_exp_mx2 (z);
+
+  return p * e_mx2;
+}
+
+/* Approximation of erfcf for |x| < 4.0.  */
+static inline float
+approx_erfcf_lo (float x, uint32_t sign, const double *coeff)
+{
+  /* Polynomial contribution.  */
+  double z = (double) fabs (x);
+  float p = (float) eval_poly (z, coeff);
+  /* Gaussian contribution.  */
+  float e_mx2 = (float) eval_exp_mx2 (z);
+
+  if (sign)
+    return fmaf (-p, e_mx2, 2.0f);
+  else
+    return p * e_mx2;
+}
+
+/* Top 12 bits of a float with the sign bit cleared.  */
+static inline uint32_t
+abstop12 (float x)
+{
+  return (asuint (x) >> 20) & 0x7ff;
+}
+
+/* Top 12 bits of a float.  */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+/* Fast erfcf approximation using a polynomial approximation multiplied
+   by a Gaussian.
+   Most of the computation is carried out in double precision, and is
+   very sensitive to the accuracy of the polynomial and exp evaluation.
+   Worst-case error is 1.968 ULP, obtained for x = 2.0412941:
+   erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp
+   err 1.46788.  */
+float
+erfcf (float x)
+{
+  /* Get top words and sign.  */
+  uint32_t ix = asuint (x); /* We need to compare at most 32 bits.  */
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* Handle special cases and small values with a single comparison:
+       abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
+
+     Special cases
+       erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2
+
+     Errno
+       EDOM does not have to be set in case of erfcf(nan).
+       Only ERANGE may be set in case of underflow.
+
+     Small values (|x|<small)
+       |x|<0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328).  */
+  if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328))
+    {
+      if (abstop12 (x) >= 0x7f8)
+	return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
+      else
+	return 1.0f - x; /* Small case.  */
+    }
+
+  /* Normalized numbers divided in 4 intervals
+     with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for
+     the interesting region as it is the smallest value, representable as a
+     12-bit integer, for which returning 0 gives <1.5 ULP.  */
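+  /* In top12 terms: top12(2.0) = 0x400, top12(4.0) = 0x408,
+     top12(8.0) = 0x410 and top12(10.0) = 0x412.  */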
+  if (ia12 < 0x400)
+    {
+      return approx_erfcf_lo (x, sign, P (0));
+    }
+  if (ia12 < 0x408)
+    {
+      return approx_erfcf_lo (x, sign, P (1));
+    }
+  if (ia12 < 0x410)
+    {
+      return approx_erfcf_hi (x, sign, P (2));
+    }
+  if (ia12 < 0x412)
+    {
+      return approx_erfcf_hi (x, sign, P (3));
+    }
+  if (sign)
+    {
+      return 2.0f;
+    }
+  return __math_uflowf (0);
+}
+
+PL_SIG (S, F, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (erfcf, 1.5)
+PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000)
+PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (erfcf, 0, inf, 40000)
diff --git a/pl/math/erfcf_data.c b/pl/math/erfcf_data.c
new file mode 100644
index 0000000..2e018c8
--- /dev/null
+++ b/pl/math/erfcf_data.c
@@ -0,0 +1,57 @@
+/*
+ * Data used in single-precision erfc(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
+   precision. Generated using the Remez algorithm on each interval separately
+   (see erfcf.sollya for more detail).  */
+const struct erfcf_poly_data __erfcf_poly_data
+  = {.poly
+     = {{
+#if ERFCF_POLY_NCOEFFS == 16
+	  0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1,
+	  -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2,
+	  0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5,
+	  -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9,
+	  0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17,
+	  -0x1.32712a6275c4dp-21
+#endif
+	},
+
+	{
+#if ERFCF_POLY_NCOEFFS == 16
+	  0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1,
+	  -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3,
+	  0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8,
+	  -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15,
+	  0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27,
+	  -0x1.8ec1581647f9fp-33
+#endif
+	},
+
+	{
+#if ERFCF_POLY_NCOEFFS == 16
+	  0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1,
+	  -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6,
+	  0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13,
+	  -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23,
+	  0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38,
+	  -0x1.027034672f11cp-44
+#endif
+	},
+
+	{
+#if ERFCF_POLY_NCOEFFS == 16
+	  0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2,
+	  -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8,
+	  0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17,
+	  -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30,
+	  0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46,
+	  -0x1.45abac612344bp-53
+#endif
+	}}};
diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c
new file mode 100644
index 0000000..1a69872
--- /dev/null
+++ b/pl/math/erff_1u5.c
@@ -0,0 +1,108 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "estrinf.h"
+#include "hornerf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define A __erff_data.erff_poly_A
+#define B __erff_data.erff_poly_B
+
+/* Top 12 bits of a float.  */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+/* Efficient implementation of erff using either a pure polynomial
+   approximation or the exponential of a polynomial. Worst-case error is
+   1.09 ULP at 0x1.c111acp-1.  */
+float
+erff (float x)
+{
+  float r, x2;
+
+  /* Get top word.  */
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* The boundary between the two intervals is 0.875 for performance reasons,
+     but the coefficients were computed on [0.0, 0.921875] and
+     [0.921875, 4.0], which moved the worst-case error from 0.94 to
+     1.1 ULP.  */
+  if (ia12 < 0x3f6)
+    { /* a = |x| < 0.875.  */
+
+      /* Tiny and subnormal cases.  */
+      if (unlikely (ia12 < 0x318))
+	{ /* |x| < 2^(-28).  */
+	  if (unlikely (ia12 < 0x040))
+	    { /* |x| < 2^(-119).  */
+	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflowf (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x;
+	}
+
+      x2 = x * x;
+
+      /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2).
+       */
+#define C(i) A[i]
+      r = fmaf (HORNER_5 (x2, C), x, x);
+#undef C
+    }
+  else if (ia12 < 0x408)
+    { /* |x| < 4.0 - Use a custom Estrin scheme.  */
+
+      float a = fabsf (x);
+      /* Use Estrin scheme on high order (small magnitude) coefficients.  */
+#define C(i) B[i]
+      r = ESTRIN_3_ (a, x * x, C, 3);
+#undef C
+      /* Then switch to pure Horner scheme.  */
+      r = fmaf (r, a, B[2]);
+      r = fmaf (r, a, B[1]);
+      r = fmaf (r, a, B[0]);
+      r = fmaf (r, a, a);
+      /* Single precision exponential with ~0.5ulps ensures erff has maximum
+	 relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on
+	 [0.875, 4.0].  */
+      r = expf (-r);
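+      /* The polynomial r approximates -log(erfc(a)), so after the
+         exponential r ~= erfc(a); below, the result is formed as
+         1 - erfc(a) with the sign of x.  */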
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f + r;
+      else
+	r = 1.0f - r;
+    }
+  else
+    { /* |x| >= 4.0.  */
+
+      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
+      if (unlikely (ia12 >= 0x7f8))
+	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f;
+      else
+	r = 1.0f;
+    }
+  return r;
+}
+
+PL_SIG (S, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (erff, 0.6)
+PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
+PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (erff, 0, inf, 40000)
diff --git a/pl/math/erff_data.c b/pl/math/erff_data.c
new file mode 100644
index 0000000..2352bae
--- /dev/null
+++ b/pl/math/erff_data.c
@@ -0,0 +1,16 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff.  */
+const struct erff_data __erff_data
+  = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+		     -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f},
+     .erff_poly_B
+     = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f,
+	0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}};
diff --git a/pl/math/estrin.h b/pl/math/estrin.h
new file mode 100644
index 0000000..f967fb0
--- /dev/null
+++ b/pl/math/estrin.h
@@ -0,0 +1,16 @@
+/*
+ * Helper macros for double-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h
new file mode 100644
index 0000000..2ae0700
--- /dev/null
+++ b/pl/math/estrin_wrap.h
@@ -0,0 +1,48 @@
+/*
+ * Helper macros for double-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
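+/* ESTRIN_n evaluates the degree-n polynomial c(0) + c(1)*x + ... + c(n)*x^n
+   from precomputed powers of x, pairing sub-polynomials so that independent
+   FMAs can execute in parallel.  For example, expanding the macros below:
+   ESTRIN_3 (x, x2, c) = FMA (x2, c(2) + c(3)*x, c(0) + c(1)*x).  */
+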
+// clang-format off
+#define  ESTRIN_1_(x,                  c, i) FMA(x,   c(1 + i),                        c(i))
+#define  ESTRIN_2_(x, x2,              c, i) FMA(x2,  c(2 + i),                        ESTRIN_1_(x,              c, i))
+#define  ESTRIN_3_(x, x2,              c, i) FMA(x2,  ESTRIN_1_(x,         c,  2 + i), ESTRIN_1_(x,              c, i))
+#define  ESTRIN_4_(x, x2, x4,          c, i) FMA(x4,  c(4 + i),                        ESTRIN_3_(x, x2,          c, i))
+#define  ESTRIN_5_(x, x2, x4,          c, i) FMA(x4,  ESTRIN_1_(x,         c,  4 + i), ESTRIN_3_(x, x2,          c, i))
+#define  ESTRIN_6_(x, x2, x4,          c, i) FMA(x4,  ESTRIN_2_(x, x2,     c,  4 + i), ESTRIN_3_(x, x2,          c, i))
+#define  ESTRIN_7_(x, x2, x4,          c, i) FMA(x4,  ESTRIN_3_(x, x2,     c,  4 + i), ESTRIN_3_(x, x2,          c, i))
+#define  ESTRIN_8_(x, x2, x4, x8,      c, i) FMA(x8,  c(8 + i),                        ESTRIN_7_(x, x2, x4,      c, i))
+#define  ESTRIN_9_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_1_(x,         c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_10_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_2_(x, x2,     c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_11_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_3_(x, x2,     c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_12_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_4_(x, x2, x4, c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_13_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_5_(x, x2, x4, c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_14_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_6_(x, x2, x4, c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_15_(x, x2, x4, x8,      c, i) FMA(x8,  ESTRIN_7_(x, x2, x4, c,  8 + i), ESTRIN_7_(x, x2, x4,      c, i))
+#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i),                       ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x,         c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2,     c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2,     c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
+
+#define  ESTRIN_1(x,                  c)  ESTRIN_1_(x,                  c, 0)
+#define  ESTRIN_2(x, x2,              c)  ESTRIN_2_(x, x2,              c, 0)
+#define  ESTRIN_3(x, x2,              c)  ESTRIN_3_(x, x2,              c, 0)
+#define  ESTRIN_4(x, x2, x4,          c)  ESTRIN_4_(x, x2, x4,          c, 0)
+#define  ESTRIN_5(x, x2, x4,          c)  ESTRIN_5_(x, x2, x4,          c, 0)
+#define  ESTRIN_6(x, x2, x4,          c)  ESTRIN_6_(x, x2, x4,          c, 0)
+#define  ESTRIN_7(x, x2, x4,          c)  ESTRIN_7_(x, x2, x4,          c, 0)
+#define  ESTRIN_8(x, x2, x4, x8,      c)  ESTRIN_8_(x, x2, x4, x8,      c, 0)
+#define  ESTRIN_9(x, x2, x4, x8,      c)  ESTRIN_9_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_10(x, x2, x4, x8,      c) ESTRIN_10_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_11(x, x2, x4, x8,      c) ESTRIN_11_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_12(x, x2, x4, x8,      c) ESTRIN_12_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_13(x, x2, x4, x8,      c) ESTRIN_13_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_14(x, x2, x4, x8,      c) ESTRIN_14_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_15(x, x2, x4, x8,      c) ESTRIN_15_(x, x2, x4, x8,      c, 0)
+#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0)
+#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0)
+// clang-format on
diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h
new file mode 100644
index 0000000..175233c
--- /dev/null
+++ b/pl/math/estrinf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Estrin polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "estrin_wrap.h"
diff --git a/pl/math/exp.c b/pl/math/exp.c
new file mode 100644
index 0000000..90253b6
--- /dev/null
+++ b/pl/math/exp.c
@@ -0,0 +1,163 @@
+/*
+ * Double-precision e^x function.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdint.h>
+#include "math_config.h"
+
+#define N (1 << EXP_TABLE_BITS)
+#define InvLn2N __exp_data.invln2N
+#define NegLn2hiN __exp_data.negln2hiN
+#define NegLn2loN __exp_data.negln2loN
+#define Shift __exp_data.shift
+#define T __exp_data.tab
+#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
+#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
+#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
+#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
+#define C6 __exp_data.poly[9 - EXP_POLY_ORDER]
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, but it has a computed exponent that may have overflowed
+   into the sign bit, so that needs to be adjusted before using it as a
+   double.  (int32_t)KI is the k used in the argument reduction and exponent
+   adjustment of scale; positive k here means the result may overflow and
+   negative k means the result may underflow.  */
+static inline double
+specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
+{
+  double_t scale, y;
+
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
+      sbits -= 1009ull << 52;
+      scale = asdouble (sbits);
+      y = 0x1p1009 * (scale + scale * tmp);
+      return check_oflow (eval_as_double (y));
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t hi, lo;
+      lo = scale - y + scale * tmp;
+      hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = eval_as_double (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+  return check_uflow (eval_as_double (y));
+}
+
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t
+top12 (double x)
+{
+  return asuint64 (x) >> 52;
+}
+
+/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+   If hastail is 0 then xtail is assumed to be 0 too.  */
+static inline double
+exp_inline (double x, double xtail, int hastail)
+{
+  uint32_t abstop;
+  uint64_t ki, idx, top, sbits;
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t kd, z, r, r2, scale, tail, tmp;
+
+  abstop = top12 (x) & 0x7ff;
+  if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54)))
+    {
+      if (abstop - top12 (0x1p-54) >= 0x80000000)
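+	/* The unsigned comparison wraps: this condition holds exactly when
+	   abstop < top12 (0x1p-54), i.e. |x| < 0x1p-54.  */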
+	/* Avoid spurious underflow for tiny x.  */
+	/* Note: 0 is common input.  */
+	return WANT_ROUNDING ? 1.0 + x : 1.0;
+      if (abstop >= top12 (1024.0))
+	{
+	  if (asuint64 (x) == asuint64 (-INFINITY))
+	    return 0.0;
+	  if (abstop >= top12 (INFINITY))
+	    return 1.0 + x;
+	  if (asuint64 (x) >> 63)
+	    return __math_uflow (0);
+	  else
+	    return __math_oflow (0);
+	}
+      /* Large x is special cased below.  */
+      abstop = 0;
+    }
+
+  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
+  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
+  z = InvLn2N * x;
+#if TOINT_INTRINSICS
+  kd = roundtoint (z);
+  ki = converttoint (z);
+#elif EXP_USE_TOINT_NARROW
+  /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes.  */
+  kd = eval_as_double (z + Shift);
+  ki = asuint64 (kd) >> 16;
+  kd = (double_t) (int32_t) ki;
+#else
+  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
+  kd = eval_as_double (z + Shift);
+  ki = asuint64 (kd);
+  kd -= Shift;
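+  /* Adding Shift = 0x1.8p52 moves z into a binade where the unit in the last
+     place is 1, so the addition rounds z to an integer whose low bits can be
+     read directly from kd's representation.  */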
+#endif
+  r = x + kd * NegLn2hiN + kd * NegLn2loN;
+  /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
+  if (hastail)
+    r += xtail;
+  /* 2^(k/N) ~= scale * (1 + tail).  */
+  idx = 2 * (ki % N);
+  top = ki << (52 - EXP_TABLE_BITS);
+  tail = asdouble (T[idx]);
+  /* This is only a valid scale when -1023*N < k < 1024*N.  */
+  sbits = T[idx + 1] + top;
+  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  r2 = r * r;
+  /* Without fma the worst case error is 0.25/N ulp larger.  */
+  /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
+#if EXP_POLY_ORDER == 4
+  tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4);
+#elif EXP_POLY_ORDER == 5
+  tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+#elif EXP_POLY_ORDER == 6
+  tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6);
+#endif
+  if (unlikely (abstop == 0))
+    return specialcase (tmp, sbits, ki);
+  scale = asdouble (sbits);
+  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+     is no spurious underflow here even without fma.  */
+  return eval_as_double (scale + scale * tmp);
+}
+
+/* May be useful for implementing pow, where an input with more than double
+   precision is needed.  */
+double
+__exp_dd (double x, double xtail)
+{
+  return exp_inline (x, xtail, 1);
+}
+
diff --git a/pl/math/exp_data.c b/pl/math/exp_data.c
new file mode 100644
index 0000000..2354be7
--- /dev/null
+++ b/pl/math/exp_data.c
@@ -0,0 +1,1120 @@
+/*
+ * Shared data between exp, exp2 and pow.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << EXP_TABLE_BITS)
+
+const struct exp_data __exp_data = {
+// N/ln2
+.invln2N = 0x1.71547652b82fep0 * N,
+// -ln2/N
+#if N == 64
+.negln2hiN = -0x1.62e42fefa0000p-7,
+.negln2loN = -0x1.cf79abc9e3b3ap-46,
+#elif N == 128
+.negln2hiN = -0x1.62e42fefa0000p-8,
+.negln2loN = -0x1.cf79abc9e3b3ap-47,
+#elif N == 256
+.negln2hiN = -0x1.62e42fefc0000p-9,
+.negln2loN = 0x1.c610ca86c3899p-45,
+#elif N == 512
+.negln2hiN = -0x1.62e42fef80000p-10,
+.negln2loN = -0x1.1cf79abc9e3b4p-45,
+#endif
+// Used for rounding when !TOINT_INTRINSICS
+#if EXP_USE_TOINT_NARROW
+.shift = 0x1800000000.8p0,
+#else
+.shift = 0x1.8p52,
+#endif
+// exp polynomial coefficients.
+.poly = {
+#if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE
+// abs error: 1.5543*2^-60
+// ulp error: 0.529 (0.533 without fma)
+// if |x| < ln2/128+eps
+// abs error if |x| < ln2/64: 1.7157*2^-50
+0x1.fffffffffdbcdp-2,
+0x1.555555555444cp-3,
+0x1.555573c6a9f7dp-5,
+0x1.1111266d28935p-7,
+#elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE
+// abs error: 1.6735*2^-64
+// ulp error: 0.518 (0.522 without fma)
+// if |x| < ln2/64
+0x1.5555555548f9ap-3,
+0x1.555555554bf5dp-5,
+0x1.11115b75f0f4dp-7,
+0x1.6c171a6b6303ep-10,
+#elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE
+// abs error: 1.555*2^-66
+// ulp error: 0.509 (0.511 without fma)
+// if |x| < ln2/256+eps
+// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
+// abs error if |x| < ln2/128: 1.7145*2^-56
+0x1.ffffffffffdbdp-2,
+0x1.555555555543cp-3,
+0x1.55555cf172b91p-5,
+0x1.1111167a4d017p-7,
+#elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE
+// abs error: 1.5542*2^-60
+// ulp error: 0.521 (0.523 without fma)
+// if |x| < ln2/128
+0x1.fffffffffdbcep-2,
+0x1.55555555543c2p-3,
+0x1.555573c64f2e3p-5,
+0x1.111126b4eff73p-7,
+#elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE
+// abs error: 1.6861*2^-71
+// ulp error: 0.509 (0.511 without fma)
+// if |x| < ln2/128
+0x1.55555555548fdp-3,
+0x1.555555555658fp-5,
+0x1.111123a859bb6p-7,
+0x1.6c16ba6920cabp-10,
+#elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE
+// abs error: 1.43*2^-58
+// ulp error: 0.549 (0.550 without fma)
+// if |x| < ln2/512
+0x1p0, // unused
+0x1.fffffffffffd4p-2,
+0x1.5555571d6ef9p-3,
+0x1.5555576a5adcep-5,
+#elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE
+// abs error: 1.5547*2^-66
+// ulp error: 0.505 (0.506 without fma)
+// if |x| < ln2/256
+0x1.ffffffffffdbdp-2,
+0x1.555555555543cp-3,
+0x1.55555cf16e1edp-5,
+0x1.1111167a4b553p-7,
+#elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE
+// abs error: 1.4300*2^-63
+// ulp error: 0.504
+// if |x| < ln2/1024
+// abs error if |x| < ln2/512: 1.0689*2^-55
+0x1p0, // unused
+0x1.ffffffffffffdp-2,
+0x1.555555c75bb6p-3,
+0x1.555555dec04a8p-5,
+#endif
+},
+.exp2_shift = 0x1.8p52 / N,
+// exp2 polynomial coefficients.
+.exp2_poly = {
+#if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE
+// abs error: 1.3054*2^-63
+// ulp error: 0.515
+// if |x| < 1/64
+0x1.62e42fefa39efp-1,
+0x1.ebfbdff82c58fp-3,
+0x1.c6b08d7045cf1p-5,
+0x1.3b2ab6fb8fd0ep-7,
+0x1.5d884afec48d7p-10,
+0x1.43097dc684ae1p-13,
+#elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE
+// abs error: 1.2195*2^-65
+// ulp error: 0.507 (0.511 without fma)
+// if |x| < 1/256
+// abs error if |x| < 1/128: 1.9941*2^-56
+0x1.62e42fefa39efp-1,
+0x1.ebfbdff82c424p-3,
+0x1.c6b08d70cf4b5p-5,
+0x1.3b2abd24650ccp-7,
+0x1.5d7e09b4e3a84p-10,
+#elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE
+// abs error: 1.2195*2^-65
+// ulp error: 0.504 (0.508 without fma)
+// if |x| < 1/256
+0x1.62e42fefa39efp-1,
+0x1.ebfbdff82c424p-3,
+0x1.c6b08d70cf4b5p-5,
+0x1.3b2abd24650ccp-7,
+0x1.5d7e09b4e3a84p-10,
+#elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE
+// abs error: 1.4411*2^-64
+// ulp error: 0.5024 (0.5063 without fma)
+// if |x| < 1/1024
+// abs error if |x| < 1/512: 1.9430*2^-56
+0x1.62e42fefa39ecp-1,
+0x1.ebfbdff82c58bp-3,
+0x1.c6b08e46de41fp-5,
+0x1.3b2ab786ee1dap-7,
+#endif
+},
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// tab[2*k] = asuint64(T[k])
+// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
+.tab = {
+#if N == 64
+0x0, 0x3ff0000000000000,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0xbc93cedd78565858, 0x3feea23882552225,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+#elif N == 128
+0x0, 0x3ff0000000000000,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc93cedd78565858, 0x3feea23882552225,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+#elif N == 256
+0x0, 0x3ff0000000000000,
+0xbc84e82fc61851ac, 0x3feffb1afa5abcbf,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0xbc82985dd8521d32, 0x3feff168143b0281,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0x3c651e617061bfbd, 0x3fefe7d42e11bbcc,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c845fad437fa426, 0x3fefde5f72f654b1,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0xbc954529642b232f, 0x3fefd50a0e3c1f89,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0x3c8293708ef5c32e, 0x3fefcbd42b72a836,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0xbc95b9280905b2a4, 0x3fefc2bdf66607e0,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c9407fb30d06420, 0x3fefb0f145e46c85,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0xbc9a5d04b3b9911b, 0x3fefa83b23395dec,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc937a01f0739546, 0x3fef9fa55fdfa9c5,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0x3c8b8268b04ef0a5, 0x3fef973028d7233e,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc65704e90c9f860, 0x3fef86a814f204ab,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0xbc897cea57e46280, 0x3fef7e95934f312e,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0x3c56f01429e2b9d2, 0x3fef76a45471c3c2,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0x3c6e653b2459034b, 0x3fef6ed48695bbc0,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c92cc7ea345b7dc, 0x3fef672658375d2f,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c8cdc1873af2155, 0x3fef582f95281c6b,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc9493684653a131, 0x3fef50e75eb44027,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0xbc98e2899077520a, 0x3fef49c18438ce4d,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c9120fcd4f59273, 0x3fef42be3578a819,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c89b788c188c9b8, 0x3fef3bdda27912d1,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c877afbca90ef84, 0x3fef351ffb82140a,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c91512f082876ee, 0x3fef2e85711ece75,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0xbc803297e78260bf, 0x3fef21ba7591bb70,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0xbc91e75c40b4251e, 0x3fef157e39771b2f,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c98a911f1f7785a, 0x3fef0f961f641589,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0xbc61e7c998db7dbb, 0x3fef09d24abd886b,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c85425c11faadf4, 0x3fef0432edeeb2fd,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0xbc979517a03e2847, 0x3feefeb83ba8ea32,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc800e2a46da4bee, 0x3feef96266e3fa2d,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0xbc87430803972b34, 0x3feef431a2de883b,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc954de30ae02d94, 0x3feeef26231e754a,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0xbc94f2487e1c03ec, 0x3feeea401b7140ef,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c79c3bba5562a2f, 0x3feee0e544ede173,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc85a71612e21658, 0x3feedc70df1c5175,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0x3c86421f6f1d24d6, 0x3feed822c367a024,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc9348a6815fce65, 0x3feed3fb2709468a,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0x3c835c43984d9871, 0x3feecffa3f84b9d4,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0xbc632afc8d9473a0, 0x3feecc2042a7d232,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0xbc95fc5e44de020e, 0x3feec86d668b3237,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0xbc7ea0148327c42f, 0x3feec4e1e192aed2,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0xbc7a843ad1a88022, 0x3feec17dea6db7d7,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c892ca3bf144e63, 0x3feebe41b817c114,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0xbc902c99b04aa8b0, 0x3feebb2d81d8abff,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0x3c73e34f67e67118, 0x3feeb8417f4531ee,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0x3c81bd2888075068, 0x3feeb2e2f4f6ad27,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0xbc896be8ae89ef8f, 0x3feeb070dde910d2,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0xbc88e6ac90348602, 0x3feeae27dbe2c4cf,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0xbc91af7f1365c3ac, 0x3feeac0827ff07cc,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0xbc943a3540d1898a, 0x3feeaa11fba87a03,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0xbc951f58ddaa8090, 0x3feea84590998b93,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc82e1648e50a17c, 0x3feea6a320dceb71,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0x3c95f30eda98a575, 0x3feea52ae6cdf6f4,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0x3c917ecda8a72159, 0x3feea3dd1d1929fd,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7,
+0xbc93cedd78565858, 0x3feea23882552225,
+0xbc85c33fdf910406, 0x3feea1c1c70833f6,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0x3c81079ab5789604, 0x3feea0f4b19e9538,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0x3c727df161cd7778, 0x3feea052fa75173e,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0x3c93f9924a05b767, 0x3fee9fdcddd47645,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc87557939a8b5ef, 0x3fee9f9298593ae5,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc88e67a9006c909, 0x3fee9f8286ead08a,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0x3c86597566977ac8, 0x3fee9fbd35d7cbfd,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0xbc92dad3519d7b5b, 0x3feea17b0976cfdb,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0x3c87d51410fd15c2, 0x3feea26a62ff86f0,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc760a3629969871, 0x3feea3878491c491,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0xbc522cea4f3afa1e, 0x3feea7f4179f5b21,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c7c88549b958471, 0x3feea9cad931a436,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c931143962f7877, 0x3feeabd0a478580f,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0x3c93e9e96f112479, 0x3feeae05bad61778,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0xbc51669428996971, 0x3feebbdd9a7670b3,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0xbc9294f304f166b6, 0x3feec2bb4d53fe0d,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0xbc8a1e58414c07d3, 0x3feec674194bb8d5,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0xbc79740b58a20091, 0x3feeca5e8d07f29e,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c9165830a2b96c2, 0x3feece7aed8eb8bb,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0xbc903d5cbe27874b, 0x3feed2c980460ad8,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c5986178980fce0, 0x3feed74a8af46052,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0xbc89472975b1f2a5, 0x3feedbfe53c12e59,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0x3c7d8157a34b7e7f, 0x3feee0e521356eba,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0x3c889c2ea41433c7, 0x3feef0ce6c9a8952,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc7274aedac8ff80, 0x3feef68415b749b1,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0x3c85c620ce76df06, 0x3feefc6e29f1c52a,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc8fda52e1b51e41, 0x3fef028cf22749e4,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc302899507554e5, 0x3fef0f69c3f3a207,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0xbc80dda2d4c0010c, 0x3fef16286141b33d,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0x3c836909391181d3, 0x3fef244778fafb22,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0xbc811cd7dbdf9547, 0x3fef2ba88988c933,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0xbc7ac28b7bef6621, 0x3fef33405751c4db,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc7030587207b9e1, 0x3fef3b0f2e6d1675,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0xbc8cc734592af7fc, 0x3fef43155b5bab74,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0x3c87752a44f587e8, 0x3fef4b532b08c968,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0x3c5159d9d908a96e, 0x3fef5c76e862e6d3,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0x3c8c254d16117a68, 0x3fef655d71ff6075,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc63e8e3eab2cbb4, 0x3fef81676b197d17,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0xbc776caa4c2ff1cf, 0x3fef953924676d76,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc81d5fc525d9940, 0x3fef9f7977cdb740,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0x3c8269947c2bed4a, 0x3fefb4aaa2188510,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0xbc69fa74878ba7c7, 0x3fefcac948dd7274,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0x3c901f3a75ee0efe, 0x3fefd632798844f8,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0xbc516a9ce6ed84fa, 0x3fefe1d802243c89,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0xbc699c7db2effc76, 0x3fefedba3692d514,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+0x3c64b458677f9840, 0x3feff9d96b2a23d9,
+#elif N == 512
+0x0, 0x3ff0000000000000,
+0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a,
+0xbc84e82fc61851ac, 0x3feffb1afa5abcbf,
+0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0x3c75c18e5ae0563a, 0x3feff3d1e77170b4,
+0xbc82985dd8521d32, 0x3feff168143b0281,
+0xbc705b1125cf49a5, 0x3fefef003103b10e,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0x3c9f879abbff3f87, 0x3fefea363d42b027,
+0x3c651e617061bfbd, 0x3fefe7d42e11bbcc,
+0x3c9b14003824712a, 0x3fefe57411915a8a,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c61cbf0f38af658, 0x3fefe0b9b35659d8,
+0x3c845fad437fa426, 0x3fefde5f72f654b1,
+0xbc9a3316383dcbc5, 0x3fefdc0727fc1762,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2,
+0xbc954529642b232f, 0x3fefd50a0e3c1f89,
+0xbc89b3236d111646, 0x3fefd2b99fa6407c,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0xbc8cb191be99b1b0, 0x3fefce1ead925493,
+0x3c8293708ef5c32e, 0x3fefcbd42b72a836,
+0xbc9acb71e83765b7, 0x3fefc98ba42e7d30,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c5cd3e58b03697e, 0x3fefc50088f8093f,
+0xbc95b9280905b2a4, 0x3fefc2bdf66607e0,
+0xbc8bfb07d4755452, 0x3fefc07d61701716,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715,
+0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919,
+0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0xbc85b9eb0402507b, 0x3fefb323d833d93f,
+0x3c9407fb30d06420, 0x3fefb0f145e46c85,
+0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba,
+0xbc9a5d04b3b9911b, 0x3fefa83b23395dec,
+0x3c9c8be44bf4cde8, 0x3fefa612a7b26300,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0x3c820c5444c93c44, 0x3fefa1c7c55189c6,
+0xbc937a01f0739546, 0x3fef9fa55fdfa9c5,
+0xbc84c6baeb580d7a, 0x3fef9d8503328e6d,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0x3c8657aa1b0d9f83, 0x3fef994a66f951ce,
+0x3c8b8268b04ef0a5, 0x3fef973028d7233e,
+0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc6b0b2789925e90, 0x3fef90edb6db2dc1,
+0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6,
+0xbc93aad17d197fae, 0x3fef8ccbae51a5c8,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc989c464a07ad70, 0x3fef88b1e264a0e9,
+0xbc65704e90c9f860, 0x3fef86a814f204ab,
+0xbc72c338fce197f4, 0x3fef84a058cbae1e,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0xbc6dca724cea0eb6, 0x3fef809717425438,
+0xbc897cea57e46280, 0x3fef7e95934f312e,
+0x3c464770b955d34d, 0x3fef7c962388149e,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0xbc962811c114424f, 0x3fef789d83606e12,
+0x3c56f01429e2b9d2, 0x3fef76a45471c3c2,
+0x3c8ec58e74904dd4, 0x3fef74ad3c92df73,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89,
+0x3c6e653b2459034b, 0x3fef6ed48695bbc0,
+0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c8bda920de0f6e2, 0x3fef690eba4df41f,
+0x3c92cc7ea345b7dc, 0x3fef672658375d2f,
+0xbc9a597f9a5ff71c, 0x3fef654013041dc2,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0x3c50835b125aa573, 0x3fef6179e2363cf8,
+0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c,
+0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f,
+0x3c8cdc1873af2155, 0x3fef582f95281c6b,
+0xbc6817fd6a313e3e, 0x3fef565a51860746,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc96236af85fd26a, 0x3fef52b6358e15e8,
+0xbc9493684653a131, 0x3fef50e75eb44027,
+0x3c7795eb4523abe7, 0x3fef4f1aad999e82,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0x3c8fe58b91b40095, 0x3fef4b87bf9cda38,
+0xbc98e2899077520a, 0x3fef49c18438ce4d,
+0x3c91ecaa860c614a, 0x3fef47fd7190241e,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18,
+0x3c9120fcd4f59273, 0x3fef42be3578a819,
+0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c87f1c7350e256d, 0x3fef3d9282fc1f27,
+0x3c89b788c188c9b8, 0x3fef3bdda27912d1,
+0x3c420dac6c124f4f, 0x3fef3a2af0b63bff,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0xbc99501d09bc09fd, 0x3fef36cc1c78903a,
+0x3c877afbca90ef84, 0x3fef351ffb82140a,
+0x3c73baf864dc8675, 0x3fef33760c547f15,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff,
+0x3c91512f082876ee, 0x3fef2e85711ece75,
+0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0xbc7548165d85ed32, 0x3fef29a8b16f0a30,
+0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29,
+0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0xbc93a255f697ecfe, 0x3fef234c0ea83f36,
+0xbc803297e78260bf, 0x3fef21ba7591bb70,
+0x3c8d2d19edc1e550, 0x3fef202b17779965,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc76b2173113dd8c, 0x3fef1d130f50d65c,
+0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13,
+0x3c811aa5f853590b, 0x3fef1a03fc675d1f,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0x3c61d61a34c8aa02, 0x3fef16fde4f2e280,
+0xbc91e75c40b4251e, 0x3fef157e39771b2f,
+0xbc91f892bf6b286d, 0x3fef1400cf2f6c18,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c7590c65c20e680, 0x3fef110cc15d5346,
+0x3c98a911f1f7785a, 0x3fef0f961f641589,
+0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2,
+0xbc61e7c998db7dbb, 0x3fef09d24abd886b,
+0x3c7b3bf786a54a87, 0x3fef08670653dfe4,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c74bb6c41732885, 0x3fef05975721b004,
+0x3c85425c11faadf4, 0x3fef0432edeeb2fd,
+0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0xbc5451d60c6ac9eb, 0x3fef001375752b40,
+0xbc979517a03e2847, 0x3feefeb83ba8ea32,
+0x3c8787a210ceafd9, 0x3feefd5f4fb45e20,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc888d1e4629943d, 0x3feefab46484ebb4,
+0xbc800e2a46da4bee, 0x3feef96266e3fa2d,
+0xbc93369c544088b6, 0x3feef812ba4ea77d,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0x3c85373ce4eb6dfb, 0x3feef57a577dd72b,
+0xbc87430803972b34, 0x3feef431a2de883b,
+0x3c83adec8265a67f, 0x3feef2eb428335b4,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc835388bcac6bc5, 0x3feef06581d3f669,
+0xbc954de30ae02d94, 0x3feeef26231e754a,
+0x3c727cdb4e4b6640, 0x3feeede91be9c811,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0x3c86c2696a26af35, 0x3feeeb761742d808,
+0xbc94f2487e1c03ec, 0x3feeea401b7140ef,
+0x3c888f6ff06b979a, 0x3feee90c7a61d55b,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea,
+0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4,
+0xbc76b8867f91c9d6, 0x3feee4559212ef89,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c94c9c0b5157fe6, 0x3feee20853c10f28,
+0x3c79c3bba5562a2f, 0x3feee0e544ede173,
+0xbc62455345b51c8e, 0x3feedfc4976d27fa,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc93331de45477d0, 0x3feedd8a63b0a09b,
+0xbc85a71612e21658, 0x3feedc70df1c5175,
+0xbc95f84d39b39b16, 0x3feedb59bf29743f,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0xbc72ba4dc7c4d562, 0x3feed932b07a35df,
+0x3c86421f6f1d24d6, 0x3feed822c367a024,
+0xbc844f25dc02691f, 0x3feed7153e4a136a,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc888d328eb9b501, 0x3feed5016f44d8f5,
+0xbc9348a6815fce65, 0x3feed3fb2709468a,
+0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0xbc615f0a2b9cd452, 0x3feed0f6d5817663,
+0x3c835c43984d9871, 0x3feecffa3f84b9d4,
+0xbc8c2e465a919e1d, 0x3feecf0018321a1a,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0xbc865dfd02bd08f1, 0x3feecd1318eb43ec,
+0xbc632afc8d9473a0, 0x3feecc2042a7d232,
+0xbc8e68cec89b1762, 0x3feecb2fde7006f4,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0xbc48ae858eb682ca, 0x3feec9566f8827d0,
+0xbc95fc5e44de020e, 0x3feec86d668b3237,
+0x3c5dd71277c0915f, 0x3feec786d3001fe5,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c92001325ecd7fb, 0x3feec5c10fa920a1,
+0xbc7ea0148327c42f, 0x3feec4e1e192aed2,
+0x3c65ace6e2870332, 0x3feec4052c5916c4,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0xbc9595c55690ffaf, 0x3feec2532feaada6,
+0xbc7a843ad1a88022, 0x3feec17dea6db7d7,
+0xbc8b401ba9fb5199, 0x3feec0ab213d5283,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c6df82bf324cc57, 0x3feebf0d073537ca,
+0x3c892ca3bf144e63, 0x3feebe41b817c114,
+0x3c97cae38641c7bb, 0x3feebd78e8bb586b,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a,
+0xbc902c99b04aa8b0, 0x3feebb2d81d8abff,
+0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc80b582d74a55d9, 0x3feeb8f8b804f127,
+0x3c73e34f67e67118, 0x3feeb8417f4531ee,
+0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc592dca38593e20, 0x3feeb62b00da3b14,
+0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef,
+0xbc85daca9994833e, 0x3feeb4d359dfd53d,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0xbc980b4321bc6dae, 0x3feeb385df598d78,
+0x3c81bd2888075068, 0x3feeb2e2f4f6ad27,
+0xbc8390afec5241c5, 0x3feeb24298571b06,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf,
+0xbc896be8ae89ef8f, 0x3feeb070dde910d2,
+0xbc910aa91ae9b67f, 0x3feeafdac1351819,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c957e1b67462375, 0x3feeaeb63f4d854c,
+0xbc88e6ac90348602, 0x3feeae27dbe2c4cf,
+0x3c8124d5051552a7, 0x3feead9c0d59ca07,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0xbc3ca103952ecf1f, 0x3feeac8c32824135,
+0xbc91af7f1365c3ac, 0x3feeac0827ff07cc,
+0x3c773345c02a4fd6, 0x3feeab86b5f43d92,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e,
+0xbc943a3540d1898a, 0x3feeaa11fba87a03,
+0xbc924f2cb4f81746, 0x3feea99af482fc8f,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0xbc943592a0a9846b, 0x3feea8b4be135acc,
+0xbc951f58ddaa8090, 0x3feea84590998b93,
+0xbc956bc85d444f4f, 0x3feea7d902d47c65,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0x3c914d1e4218319f, 0x3feea707ca0cbf0f,
+0xbc82e1648e50a17c, 0x3feea6a320dceb71,
+0x3c971c93709313f4, 0x3feea6411b078d26,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0x3c7f88303b60d222, 0x3feea584fd15612a,
+0x3c95f30eda98a575, 0x3feea52ae6cdf6f4,
+0x3c70125ca18d4b5b, 0x3feea4d3778bc944,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0x3c9592ea73798b11, 0x3feea42c91c56acd,
+0x3c917ecda8a72159, 0x3feea3dd1d1929fd,
+0xbc9371d6d7d75739, 0x3feea390532205d8,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc8ac05fd996f807, 0x3feea2fec30678b7,
+0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7,
+0xbc91f5067d03653a, 0x3feea277e8dcc390,
+0xbc93cedd78565858, 0x3feea23882552225,
+0x3c917339c86ce3ad, 0x3feea1fbcc140be7,
+0xbc85c33fdf910406, 0x3feea1c1c70833f6,
+0xbc77e66065ba2500, 0x3feea18a7420a036,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0x3c964c827ee6b49a, 0x3feea123e87bfb7a,
+0x3c81079ab5789604, 0x3feea0f4b19e9538,
+0xbc928311a3c73480, 0x3feea0c830a4c8d4,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0x3c882c79e185e981, 0x3feea077541ee718,
+0x3c727df161cd7778, 0x3feea052fa75173e,
+0xbc8b48cea80b043b, 0x3feea0315a736c75,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09,
+0x3c93f9924a05b767, 0x3fee9fdcddd47645,
+0x3c954835dd4b7548, 0x3fee9fc62dea2f8a,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8,
+0xbc87557939a8b5ef, 0x3fee9f9298593ae5,
+0xbc8f652fde52775c, 0x3fee9f86e7ba9fef,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0xbc7b0300defbcf98, 0x3fee9f77ce1303f6,
+0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87,
+0xbc89dab646035dc0, 0x3fee9f73c4eaa988,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc91f0c230588dde, 0x3fee9f7ad3ef9011,
+0xbc88e67a9006c909, 0x3fee9f8286ead08a,
+0x3c9106450507a28c, 0x3fee9f8d02d50b8f,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0xbc9129729a10f3a0, 0x3fee9faa5953c849,
+0x3c86597566977ac8, 0x3fee9fbd35d7cbfd,
+0x3c781a70a5124f67, 0x3fee9fd2df29ce7c,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c941626ea62646d, 0x3feea0069c1a861d,
+0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09,
+0xbc940b9f54365b7c, 0x3feea04597eeba8f,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0x3c873455e0e826c1, 0x3feea08fda749e5d,
+0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6,
+0x3c94f006ad874e3e, 0x3feea0e56b7fcf03,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0xbc8f6d693d0973bb, 0x3feea14652e958aa,
+0xbc92dad3519d7b5b, 0x3feea17b0976cfdb,
+0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0xbc88b25e045d207b, 0x3feea22a4456e7a3,
+0x3c87d51410fd15c2, 0x3feea26a62ff86f0,
+0xbc69cb3314060ca7, 0x3feea2ad5e2850ac,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9,
+0xbc760a3629969871, 0x3feea3878491c491,
+0x3c94aa7212bfa73c, 0x3feea3d5fbab091f,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0xbc81e688272a8a12, 0x3feea47b8f4abaa9,
+0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9,
+0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0x3c8a1e274eed4476, 0x3feea5e968443d9a,
+0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7,
+0x3c94a533a59324da, 0x3feea6b1bdadb46d,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0x3c7a56d2760d087d, 0x3feea785b91e07f1,
+0xbc522cea4f3afa1e, 0x3feea7f4179f5b21,
+0x3c91682c1c6e8b05, 0x3feea86562ab00ec,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c89ea99cf7a9591, 0x3feea950c27004c2,
+0x3c7c88549b958471, 0x3feea9cad931a436,
+0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c909b176e05a9cd, 0x3feeab4ac52be8f7,
+0x3c931143962f7877, 0x3feeabd0a478580f,
+0x3c711607f1952c95, 0x3feeac597875c644,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0x3c869608f0f86431, 0x3feead74029db01e,
+0x3c93e9e96f112479, 0x3feeae05bad61778,
+0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6,
+0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9,
+0x3c81c1701c359530, 0x3feeb10afc931857,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc8edb1bf6809287, 0x3feeb2553499284b,
+0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a,
+0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc93fc025e1db9ce, 0x3feeb50dad829e70,
+0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2,
+0xbc8d737c7d71382e, 0x3feeb67bff148396,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0x3c6ae88c43905293, 0x3feeb7f669e2802b,
+0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5,
+0xbc93d1f7661fe51b, 0x3feeb97cf65253d1,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0x3c651b68797ffc1c, 0x3feebb0faccf9243,
+0xbc51669428996971, 0x3feebbdd9a7670b3,
+0x3c54579c5ceed70b, 0x3feebcae95cba768,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0x3c87298413381667, 0x3feebe59b9bddb5b,
+0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2,
+0xbc905000be64e965, 0x3feec01121235681,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0xbc89fb12e3454b73, 0x3feec1d4d47f2598,
+0xbc9294f304f166b6, 0x3feec2bb4d53fe0d,
+0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0x3c90622b15810eea, 0x3feec581414380f2,
+0xbc8a1e58414c07d3, 0x3feec674194bb8d5,
+0x3be9a5ecc875d327, 0x3feec76a0bcfc15e,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0x3c88ea486a3350ef, 0x3feec95f4499c647,
+0xbc79740b58a20091, 0x3feeca5e8d07f29e,
+0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c89c31f7e38028b, 0x3feecd6f23701b15,
+0x3c9165830a2b96c2, 0x3feece7aed8eb8bb,
+0xbc5fac13f4e005a3, 0x3feecf89dacfe68c,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0x3c7d8aced7162e89, 0x3feed1b1231475f7,
+0xbc903d5cbe27874b, 0x3feed2c980460ad8,
+0xbc848f50cea7269f, 0x3feed3e504f696b1,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c821eb9a08a0542, 0x3feed625893523d4,
+0x3c5986178980fce0, 0x3feed74a8af46052,
+0xbc6133a953131cfd, 0x3feed872b8950a73,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca,
+0xbc89472975b1f2a5, 0x3feedbfe53c12e59,
+0xbc90260cf07cb311, 0x3feedd333beb0b7e,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0x3c1bca400a7b939d, 0x3feedfa6a1897fd2,
+0x3c7d8157a34b7e7f, 0x3feee0e521356eba,
+0x3c9140bc34dfc19f, 0x3feee226d59a09ee,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0xbc8c9b1da461ab87, 0x3feee4b3e100301e,
+0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774,
+0x3c8c115f23ebea8e, 0x3feee74dcca5a413,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc6dcab99f23f84e, 0x3feee9f4a17a4735,
+0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff,
+0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0x3c915b1397075f04, 0x3feeef692a8fa8cd,
+0x3c889c2ea41433c7, 0x3feef0ce6c9a8952,
+0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc86a510f31e13e6, 0x3feef511c43bbd62,
+0xbc7274aedac8ff80, 0x3feef68415b749b1,
+0xbc92887ea88e7340, 0x3feef7f9ade433c6,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87,
+0x3c85c620ce76df06, 0x3feefc6e29f1c52a,
+0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc8d1bf10460dba0, 0x3fef01004b3a7804,
+0xbc8fda52e1b51e41, 0x3fef028cf22749e4,
+0x3c8e5d80813dddfc, 0x3fef041ce8e77680,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7,
+0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f,
+0x3c7a77557fd62db3, 0x3fef0a7df9285775,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc651ba6128db749, 0x3fef0dc27e2cb5e5,
+0xbc302899507554e5, 0x3fef0f69c3f3a207,
+0xbc7c0ffefdc5e251, 0x3fef111462c95b60,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30,
+0xbc80dda2d4c0010c, 0x3fef16286141b33d,
+0x3c923759b8aca76d, 0x3fef17e06ff301f4,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0xbc895498a73dac7d, 0x3fef1b5aab23e61e,
+0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c,
+0x3c851de924583108, 0x3fef1ee26b34e065,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0xbc8c5fe4051ba06c, 0x3fef2277b9881650,
+0x3c836909391181d3, 0x3fef244778fafb22,
+0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0xbc7af5c67c4e8235, 0x3fef29cb269e601f,
+0xbc811cd7dbdf9547, 0x3fef2ba88988c933,
+0xbc8304ef0045d575, 0x3fef2d89584661a1,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0x3c8725f94f910375, 0x3fef31553dfa8313,
+0xbc7ac28b7bef6621, 0x3fef33405751c4db,
+0x3c7b53e99f9191e8, 0x3fef352ee13da7cb,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc810a79e6d7e2b8, 0x3fef39164b994d23,
+0xbc7030587207b9e1, 0x3fef3b0f2e6d1675,
+0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0x3c549eeef9ec910c, 0x3fef410e9be12cb9,
+0xbc8cc734592af7fc, 0x3fef43155b5bab74,
+0xbc8335827ffb9dce, 0x3fef451f95018d17,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0x3c645563980ef762, 0x3fef493e7ba2c38c,
+0x3c87752a44f587e8, 0x3fef4b532b08c968,
+0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0xbc8aab80ceab2b4a, 0x3fef51a638197a3c,
+0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6,
+0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0x3c83c119f18464c5, 0x3fef5a461eec14be,
+0x3c5159d9d908a96e, 0x3fef5c76e862e6d3,
+0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0xbc72550d76be719a, 0x3fef631e7e2d479d,
+0x3c8c254d16117a68, 0x3fef655d71ff6075,
+0xbc82090274667d12, 0x3fef679ff37adb4a,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd,
+0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315,
+0x3c890de9296f4cd1, 0x3fef70cd9ab294e4,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0x3c832ff9978b34bc, 0x3fef7579e065807d,
+0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658,
+0xbc7303b63dda1980, 0x3fef7a347f63c159,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1,
+0xbc63e8e3eab2cbb4, 0x3fef81676b197d17,
+0x3c768d9144ae12fc, 0x3fef83d4f11f8220,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0x3c853687f542403b, 0x3fef88bad7dcee90,
+0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12,
+0xbc736ed2de40b407, 0x3fef8daf3fe592e8,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0xbc614ef56c770f3b, 0x3fef92b2334ac7ee,
+0xbc776caa4c2ff1cf, 0x3fef953924676d76,
+0x3c8df7d1353d8e88, 0x3fef97c3bc24e350,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc850bed64091b8a, 0x3fef9ce3e4933c7e,
+0xbc81d5fc525d9940, 0x3fef9f7977cdb740,
+0x3c89d852381c317f, 0x3fefa212b6bc3181,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5,
+0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e,
+0xbc5a1f25ce94cae7, 0x3fefac9c80faa594,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2,
+0x3c8269947c2bed4a, 0x3fefb4aaa2188510,
+0x3c737e8ae802b851, 0x3fefb7616ca06dd6,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c875119560e34af, 0x3fefbcda28a52e59,
+0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a,
+0xbc7431c3840929c6, 0x3fefc261cbdf5be7,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0xbc8cb472d2e86b99, 0x3fefc7f860a70c22,
+0xbc69fa74878ba7c7, 0x3fefcac948dd7274,
+0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0x3c8eef18336b62e3, 0x3fefd35288633625,
+0x3c901f3a75ee0efe, 0x3fefd632798844f8,
+0x3c80d23f87b50a2a, 0x3fefd916302bd526,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c8302dee657c8e6, 0x3fefdee8f32a4b45,
+0xbc516a9ce6ed84fa, 0x3fefe1d802243c89,
+0xbc7b0caa080df170, 0x3fefe4cadbdac61d,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54,
+0xbc699c7db2effc76, 0x3fefedba3692d514,
+0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+0x3c8e70b094fa075a, 0x3feff6cbe15f6314,
+0x3c64b458677f9840, 0x3feff9d96b2a23d9,
+0xbc72ec9a3e5d680a, 0x3feffceaca4391b6,
+#endif
+},
+};
diff --git a/pl/math/expf.c b/pl/math/expf.c
new file mode 100644
index 0000000..c325e45
--- /dev/null
+++ b/pl/math/expf.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision e^x function.
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "math_config.h"
+
+/*
+EXPF_TABLE_BITS = 5
+EXPF_POLY_ORDER = 3
+
+ULP error: 0.502 (nearest rounding.)
+Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.)
+Wrong count: 170635 (all nearest rounding wrong results with fma.)
+Non-nearest ULP error: 1 (rounded ULP error)
+*/
+
+#define N (1 << EXPF_TABLE_BITS)
+#define InvLn2N __expf_data.invln2_scaled
+#define T __expf_data.tab
+#define C __expf_data.poly_scaled
+
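+/* Top 12 bits of a float.  */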
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+float
+optr_aor_exp_f32 (float x)
+{
+  uint32_t abstop;
+  uint64_t ki, t;
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t kd, xd, z, r, r2, y, s;
+
+  xd = (double_t) x;
+  abstop = top12 (x) & 0x7ff;
+  if (unlikely (abstop >= top12 (88.0f)))
+    {
+      /* |x| >= 88 or x is nan.  */
+      if (asuint (x) == asuint (-INFINITY))
+	return 0.0f;
+      if (abstop >= top12 (INFINITY))
+	return x + x;
+      if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */
+	return __math_oflowf (0);
+      if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */
+	return __math_uflowf (0);
+    }
+
+  /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k.  */
+  z = InvLn2N * xd;
+
+  /* Round and convert z to int, the result is in [-150*N, 128*N] and
+     ideally nearest int is used, otherwise the magnitude of r can be
+     bigger which gives larger approximation error.  */
+  kd = roundtoint (z);
+  ki = converttoint (z);
+  r = z - kd;
+
+  /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
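+  /* T[i] stores the bits of 2^(i/N) minus (i << (52 - EXPF_TABLE_BITS)), so
+     adding ki << (52 - EXPF_TABLE_BITS) back both selects the tabulated
+     significand and bumps the exponent field by k/N truncated: s ~= 2^(k/N).  */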
+  t = T[ki % N];
+  t += ki << (52 - EXPF_TABLE_BITS);
+  s = asdouble (t);
+  z = C[0] * r + C[1];
+  r2 = r * r;
+  y = C[2] * r + 1;
+  y = z * r2 + y;
+  y = y * s;
+  return eval_as_float (y);
+}
diff --git a/pl/math/expf_data.c b/pl/math/expf_data.c
new file mode 100644
index 0000000..474ad57
--- /dev/null
+++ b/pl/math/expf_data.c
@@ -0,0 +1,31 @@
+/*
+ * Coeffs and table entries for single-precision exp. Copied from
+ * math/exp2f_data.c, with EXP2F_TABLE_BITS == 5 (N == 32).
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << EXPF_TABLE_BITS)
+
+const struct expf_data __expf_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)) */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .invln2_scaled = 0x1.71547652b82fep+0 * N,
+  .poly_scaled = {
+  0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N,
+  },
+};
diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
new file mode 100644
index 0000000..a3faff7
--- /dev/null
+++ b/pl/math/expm1_2u5.c
@@ -0,0 +1,96 @@
+/*
+ * Double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "estrin.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define InvLn2 0x1.71547652b82fep0
+#define Ln2hi 0x1.62e42fefa39efp-1
+#define Ln2lo 0x1.abc9e3b39803fp-56
+#define Shift 0x1.8p52
+#define TinyBound                                                              \
+  0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define BigBound 0x1.63108c75a1937p+9  /* Above which expm1(x) overflows.  */
+#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to -1. */
+#define AbsMask 0x7fffffffffffffff
+
+#define C(i) __expm1_poly[i]
+
+/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
+   The maximum observed error is 2.17 ULP:
+   expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
+			      want 0x1.a9af566038788p-2.  */
+double
+expm1 (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ax = ix & AbsMask;
+
+  /* Tiny, +Infinity.  */
+  if (ax <= TinyBound || ix == 0x7ff0000000000000)
+    return x;
+
+  /* +/-NaN.  */
+  if (ax > 0x7ff0000000000000)
+    return __math_invalid (x);
+
+  /* Result is too large to be represented as a double.  */
+  if (x >= BigBound)
+    return __math_oflow (0);
+
+  /* Result rounds to -1 in double precision.  */
+  if (x <= NegBound)
+    return -1;
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
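+  /* j = nearest integer to x/ln2: adding Shift (0x1.8p52) moves the value to
+     a range where double spacing is exactly 1, so the addition rounds to an
+     integer (assuming the default round-to-nearest mode), and subtracting
+     Shift recovers that integer as a double.  */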
+  double j = fma (InvLn2, x, Shift) - Shift;
+  int64_t i = j;
+  double f = fma (j, -Ln2hi, x);
+  f = fma (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  double f2 = f * f;
+  double f4 = f2 * f2;
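+  /* ESTRIN_10 evaluates the degree-10 polynomial in __expm1_poly with
+     Estrin's scheme, using the precomputed powers f2, f4 and f4 * f4 to
+     shorten the dependency chain compared with Horner.  */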
+  double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+
+  /* Assemble the result, using a slight rearrangement to achieve acceptable
+     accuracy.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^(i - 1).  */
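+  /* Using 2^(i-1) instead of 2^i keeps t representable for the largest valid
+     i, where 2^i could overflow; the factor of 2 is restored when the result
+     is assembled below.  */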
+  double t = ldexp (0.5, i);
+  /* expm1(x) ~= 2 * (p * t + (t - 1/2)).  */
+  return 2 * fma (p, t, t - 0.5);
+}
+
+PL_SIG (S, D, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (expm1, 1.68)
+PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000)
+PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000)
+PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000)
+PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
+PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100)
+PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100)
diff --git a/pl/math/expm1_data.c b/pl/math/expm1_data.c
new file mode 100644
index 0000000..ff7426b
--- /dev/null
+++ b/pl/math/expm1_data.c
@@ -0,0 +1,21 @@
+/*
+ * Coefficients for double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Generated using fpminimax, see tools/expm1.sollya for details.  */
+const double __expm1_poly[] = {0x1p-1,
+			       0x1.5555555555559p-3,
+			       0x1.555555555554bp-5,
+			       0x1.111111110f663p-7,
+			       0x1.6c16c16c1b5f3p-10,
+			       0x1.a01a01affa35dp-13,
+			       0x1.a01a018b4ecbbp-16,
+			       0x1.71ddf82db5bb4p-19,
+			       0x1.27e517fc0d54bp-22,
+			       0x1.af5eedae67435p-26,
+			       0x1.1f143d060a28ap-29};
diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c
new file mode 100644
index 0000000..70b14e4
--- /dev/null
+++ b/pl/math/expm1f_1u6.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "hornerf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+#define AbsMask (0x7fffffff)
+#define InfLimit                                                               \
+  (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows.  */
+#define NegLimit                                                               \
+  (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to -1.  */
+
+#define C(i) __expm1f_poly[i]
+
+/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
+   The maximum error is 1.51 ULP:
+   expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2
+			want 0x1.e2fb94p-2.  */
+float
+expm1f (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ax = ix & AbsMask;
+
+  /* Tiny: |x| <= 0x1p-23. expm1(x) is closely approximated by x.
+     Inf:  x == +Inf => expm1(x) = x.  */
+  if (ax <= 0x34000000 || (ix == 0x7f800000))
+    return x;
+
+  /* +/-NaN.  */
+  if (ax > 0x7f800000)
+    return __math_invalidf (x);
+
+  if (x >= InfLimit)
+    return __math_oflowf (0);
+
+  if (x <= NegLimit || ix == 0xff800000)
+    return -1;
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
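+  /* Round x/ln2 to the nearest integer with the same add-Shift-then-subtract
+     trick as in expm1, here with Shift = 0x1.8p23 for single precision.  */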
+  float j = fmaf (InvLn2, x, Shift) - Shift;
+  int32_t i = j;
+  float f = fmaf (j, -Ln2hi, x);
+  f = fmaf (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  float p = fmaf (f * f, HORNER_4 (f, C), f);
+  /* Assemble the result, using a slight rearrangement to achieve acceptable
+     accuracy.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^(i - 1).  */
+  float t = ldexpf (0.5f, i);
+  /* expm1(x) ~= 2 * (p * t + (t - 1/2)).  */
+  return 2 * fmaf (p, t, t - 0.5f);
+}
+
+PL_SIG (S, F, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (expm1f, 1.02)
+PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000)
+PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000)
diff --git a/pl/math/expm1f_data.c b/pl/math/expm1f_data.c
new file mode 100644
index 0000000..9d02dc4
--- /dev/null
+++ b/pl/math/expm1f_data.c
@@ -0,0 +1,12 @@
+/*
+ * Coefficients for single-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Generated using fpminimax, see tools/expm1f.sollya for details.  */
+const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5,
+			       0x1.12287cp-7, 0x1.6b55a2p-10};
diff --git a/pl/math/horner.h b/pl/math/horner.h
new file mode 100644
index 0000000..f92ab67
--- /dev/null
+++ b/pl/math/horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
new file mode 100644
index 0000000..6478968
--- /dev/null
+++ b/pl/math/horner_wrap.h
@@ -0,0 +1,37 @@
+/*
+ * Helper macros for Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define  HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i))
+#define  HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
+#define  HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
+#define  HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
+#define  HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i))
+#define  HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i))
+#define  HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i))
+#define  HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i))
+#define  HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i))
+#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i))
+#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i))
+#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i))
+
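+/* HORNER_N (x, c) computes sum_{i=0..N} c(i) * x^i as N nested fused
+   multiply-adds; e.g. HORNER_2 (x, c) expands to
+   FMA (FMA (c (2), x, c (1)), x, c (0)).  */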
+#define  HORNER_1(x, c) HORNER_1_ (x, c, 0)
+#define  HORNER_2(x, c) HORNER_2_ (x, c, 0)
+#define  HORNER_3(x, c) HORNER_3_ (x, c, 0)
+#define  HORNER_4(x, c) HORNER_4_ (x, c, 0)
+#define  HORNER_5(x, c) HORNER_5_ (x, c, 0)
+#define  HORNER_6(x, c) HORNER_6_ (x, c, 0)
+#define  HORNER_7(x, c) HORNER_7_ (x, c, 0)
+#define  HORNER_8(x, c) HORNER_8_ (x, c, 0)
+#define  HORNER_9(x, c) HORNER_9_ (x, c, 0)
+#define HORNER_10(x, c) HORNER_10_(x, c, 0)
+#define HORNER_11(x, c) HORNER_11_(x, c, 0)
+#define HORNER_12(x, c) HORNER_12_(x, c, 0)
+// clang-format on
diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h
new file mode 100644
index 0000000..0703817
--- /dev/null
+++ b/pl/math/hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "horner_wrap.h"
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
new file mode 100644
index 0000000..af5f9f9
--- /dev/null
+++ b/pl/math/include/mathlib.h
@@ -0,0 +1,246 @@
+// clang-format off
+/*
+ * Public API.
+ *
+ * Copyright (c) 2015-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _MATHLIB_H
+#define _MATHLIB_H
+
+float acoshf (float);
+float asinhf (float);
+float atan2f (float, float);
+float atanf (float);
+float atanhf (float);
+float cbrtf (float);
+float coshf (float);
+float erfcf (float);
+float erff (float);
+float expm1f (float);
+float log10f (float);
+float log1pf (float);
+float sinhf (float);
+float tanf (float);
+float tanhf (float);
+
+double acosh (double);
+double asinh (double);
+double atan (double);
+double atan2 (double, double);
+double atanh (double);
+double cbrt (double);
+double cosh (double);
+double erfc (double);
+double expm1 (double);
+double log10 (double);
+double log1p (double);
+double sinh (double);
+double tanh (double);
+
+float __s_acoshf (float);
+float __s_asinhf (float);
+float __s_atanf (float);
+float __s_atan2f (float, float);
+float __s_atanhf (float);
+float __s_cbrtf (float);
+float __s_coshf (float);
+float __s_erfcf (float);
+float __s_erff (float);
+float __s_expm1f (float);
+float __s_log10f (float);
+float __s_log1pf (float);
+float __s_log2f (float);
+float __s_sinhf (float);
+float __s_tanf (float);
+float __s_tanhf (float);
+
+double __s_acosh (double);
+double __s_asinh (double);
+double __s_atan (double);
+double __s_atan2 (double, double);
+double __s_atanh (double);
+double __s_cbrt (double);
+double __s_cosh (double);
+double __s_erf (double);
+double __s_erfc (double);
+double __s_expm1 (double);
+double __s_log10 (double);
+double __s_log1p (double);
+double __s_log2 (double);
+double __s_sinh (double);
+double __s_tan (double);
+double __s_tanh (double);
+
+#if __aarch64__
+#if __GNUC__ >= 5
+typedef __Float32x4_t __f32x4_t;
+typedef __Float64x2_t __f64x2_t;
+#elif __clang_major__*100+__clang_minor__ >= 305
+typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
+typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
+#else
+#error Unsupported compiler
+#endif
+
+/* Vector functions following the base PCS.  */
+__f32x4_t __v_acoshf (__f32x4_t);
+__f64x2_t __v_acosh (__f64x2_t);
+__f32x4_t __v_asinhf (__f32x4_t);
+__f64x2_t __v_asinh (__f64x2_t);
+__f32x4_t __v_atanf (__f32x4_t);
+__f64x2_t __v_atan (__f64x2_t);
+__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
+__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
+__f32x4_t __v_atanhf (__f32x4_t);
+__f64x2_t __v_atanh (__f64x2_t);
+__f32x4_t __v_cbrtf (__f32x4_t);
+__f64x2_t __v_cbrt (__f64x2_t);
+__f32x4_t __v_coshf (__f32x4_t);
+__f64x2_t __v_cosh (__f64x2_t);
+__f32x4_t __v_erff (__f32x4_t);
+__f64x2_t __v_erf (__f64x2_t);
+__f32x4_t __v_erfcf (__f32x4_t);
+__f64x2_t __v_erfc (__f64x2_t);
+__f32x4_t __v_expm1f (__f32x4_t);
+__f64x2_t __v_expm1 (__f64x2_t);
+__f32x4_t __v_log10f (__f32x4_t);
+__f64x2_t __v_log10 (__f64x2_t);
+__f32x4_t __v_log1pf (__f32x4_t);
+__f64x2_t __v_log1p (__f64x2_t);
+__f32x4_t __v_log2f (__f32x4_t);
+__f64x2_t __v_log2 (__f64x2_t);
+__f32x4_t __v_sinhf (__f32x4_t);
+__f64x2_t __v_sinh (__f64x2_t);
+__f32x4_t __v_tanf (__f32x4_t);
+__f64x2_t __v_tan (__f64x2_t);
+__f32x4_t __v_tanhf (__f32x4_t);
+__f64x2_t __v_tanh (__f64x2_t);
+
+#if __GNUC__ >= 9 || __clang_major__ >= 8
+#define __vpcs __attribute__((__aarch64_vector_pcs__))
+
+/* Vector functions following the vector PCS.  */
+__vpcs __f32x4_t __vn_acoshf (__f32x4_t);
+__vpcs __f64x2_t __vn_acosh (__f64x2_t);
+__vpcs __f32x4_t __vn_asinhf (__f32x4_t);
+__vpcs __f64x2_t __vn_asinh (__f64x2_t);
+__vpcs __f32x4_t __vn_atanf (__f32x4_t);
+__vpcs __f64x2_t __vn_atan (__f64x2_t);
+__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
+__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
+__vpcs __f64x2_t __vn_atanh (__f64x2_t);
+__vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
+__vpcs __f64x2_t __vn_cbrt (__f64x2_t);
+__vpcs __f32x4_t __vn_coshf (__f32x4_t);
+__vpcs __f64x2_t __vn_cosh (__f64x2_t);
+__vpcs __f32x4_t __vn_erff (__f32x4_t);
+__vpcs __f64x2_t __vn_erf (__f64x2_t);
+__vpcs __f32x4_t __vn_erfcf (__f32x4_t);
+__vpcs __f64x2_t __vn_erfc (__f64x2_t);
+__vpcs __f32x4_t __vn_expm1f (__f32x4_t);
+__vpcs __f64x2_t __vn_expm1 (__f64x2_t);
+__vpcs __f32x4_t __vn_log10f (__f32x4_t);
+__vpcs __f64x2_t __vn_log10 (__f64x2_t);
+__vpcs __f32x4_t __vn_log1pf (__f32x4_t);
+__vpcs __f64x2_t __vn_log1p (__f64x2_t);
+__vpcs __f32x4_t __vn_log2f (__f32x4_t);
+__vpcs __f64x2_t __vn_log2 (__f64x2_t);
+__vpcs __f32x4_t __vn_sinhf (__f32x4_t);
+__vpcs __f64x2_t __vn_sinh (__f64x2_t);
+__vpcs __f32x4_t __vn_tanf (__f32x4_t);
+__vpcs __f64x2_t __vn_tan (__f64x2_t);
+__vpcs __f32x4_t __vn_tanhf (__f32x4_t);
+__vpcs __f64x2_t __vn_tanh (__f64x2_t);
+
+/* Vector functions following the vector PCS using ABI names.  */
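+/* Names follow the AArch64 vector function ABI: _ZGV, 'n' for Advanced SIMD,
+   'N' for unmasked, the lane count, then 'v' per vector argument.  */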
+__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
+__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
+
+#endif
+
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t);
+svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
+svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
+svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_erff_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_erf_x (svfloat64_t, svbool_t);
+svfloat64_t __sv_erfc_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_expf_x (svfloat32_t, svbool_t);
+svfloat32_t __sv_logf_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_log_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_log10_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_log2_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t);
+svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t);
+svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
+svfloat64_t __sv_sin_x (svfloat64_t, svbool_t);
+svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t);
+/* SVE ABI names.  */
+svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t);
+svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t);
+#endif
+
+#endif
+
+#endif
+// clang-format on
diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h
new file mode 100644
index 0000000..6a81360
--- /dev/null
+++ b/pl/math/include/pl_test.h
@@ -0,0 +1,27 @@
+/*
+ * PL macros to aid testing. This version of this file is used for building the
+ * routine, not the tests. Separate definitions are found in test/pl_test.h
+ * which emit test parameters.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Emit max ULP threshold - silenced for building the routine.  */
+#define PL_TEST_ULP(f, l)
+
+/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of
+   strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is
+   also added to the test suite.  */
+#define PL_ALIAS(a, b) strong_alias (a, b)
+
+/* Emit routine name if e == 1 and f is expected to correctly trigger fenv
+   exceptions. e allows the declaration to be emitted conditionally upon
+   certain build flags; expansion is deferred by one pass so that those flags
+   are expanded properly.  */
+#define PL_TEST_EXPECT_FENV(f, e)
+#define PL_TEST_EXPECT_FENV_ALWAYS(f)
+
+#define PL_TEST_INTERVAL(f, lo, hi, n)
+#define PL_TEST_INTERVAL_C(f, lo, hi, n, c)
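+
+/* Illustrative usage (a sketch, not part of this header): in a routine file
+   these macros typically follow the implementation, e.g.
+
+     PL_TEST_ULP (log10, 1.11)
+     PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000)
+
+   In this build they expand to nothing; the definitions in test/pl_test.h
+   emit the corresponding test parameters instead.  */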
diff --git a/pl/math/log.c b/pl/math/log.c
new file mode 100644
index 0000000..40b0441
--- /dev/null
+++ b/pl/math/log.c
@@ -0,0 +1,161 @@
+/*
+ * Double-precision log(x) function.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdint.h>
+#include "math_config.h"
+
+#define T __log_data.tab
+#define T2 __log_data.tab2
+#define B __log_data.poly1
+#define A __log_data.poly
+#define Ln2hi __log_data.ln2hi
+#define Ln2lo __log_data.ln2lo
+#define N (1 << LOG_TABLE_BITS)
+#define OFF 0x3fe6000000000000
+
+/* Top 16 bits of a double.  */
+static inline uint32_t
+top16 (double x)
+{
+  return asuint64 (x) >> 48;
+}
+
+double
+optr_aor_log_f64 (double x)
+{
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo;
+  uint64_t ix, iz, tmp;
+  uint32_t top;
+  int k, i;
+
+  ix = asuint64 (x);
+  top = top16 (x);
+
+#if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11
+#define LO asuint64 (1.0 - 0x1p-5)
+#define HI asuint64 (1.0 + 0x1.1p-5)
+#elif LOG_POLY1_ORDER == 12
+#define LO asuint64 (1.0 - 0x1p-4)
+#define HI asuint64 (1.0 + 0x1.09p-4)
+#endif
+  if (unlikely (ix - LO < HI - LO))
+    {
+      /* Handle close to 1.0 inputs separately.  */
+      /* Fix sign of zero with downward rounding when x==1.  */
+      if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0)))
+	return 0;
+      r = x - 1.0;
+      r2 = r * r;
+      r3 = r * r2;
+#if LOG_POLY1_ORDER == 10
+      /* Worst-case error is around 0.516 ULP.  */
+      y = r3
+	  * (B[1] + r * B[2] + r2 * B[3]
+	     + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8])));
+      w = B[0] * r2; /* B[0] == -0.5.  */
+      hi = r + w;
+      y += r - hi + w;
+      y += hi;
+#elif LOG_POLY1_ORDER == 11
+      /* Worst-case error is around 0.516 ULP.  */
+      y = r3
+	  * (B[1] + r * B[2]
+	     + r2
+		 * (B[3] + r * B[4] + r2 * B[5]
+		    + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9])));
+      w = B[0] * r2; /* B[0] == -0.5.  */
+      hi = r + w;
+      y += r - hi + w;
+      y += hi;
+#elif LOG_POLY1_ORDER == 12
+      y = r3
+	  * (B[1] + r * B[2] + r2 * B[3]
+	     + r3
+		 * (B[4] + r * B[5] + r2 * B[6]
+		    + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
+#if N <= 64
+      /* Worst-case error is around 0.532 ULP.  */
+      w = B[0] * r2; /* B[0] == -0.5.  */
+      hi = r + w;
+      y += r - hi + w;
+      y += hi;
+#else
+      /* Worst-case error is around 0.507 ULP.  */
+      w = r * 0x1p27;
+      double_t rhi = r + w - w;
+      double_t rlo = r - rhi;
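+      /* Adding and subtracting w = r * 2^27 rounds r to its top ~26 bits
+	 (a Veltkamp-style split), so rhi * rhi below is exact and rlo holds
+	 the discarded low part.  */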
+      w = rhi * rhi * B[0]; /* B[0] == -0.5.  */
+      hi = r + w;
+      lo = r - hi + w;
+      lo += B[0] * rlo * (rhi + r);
+      y += lo;
+      y += hi;
+#endif
+#endif
+      return eval_as_double (y);
+    }
+  if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010))
+    {
+      /* x < 0x1p-1022 or inf or nan.  */
+      if (ix * 2 == 0)
+	return __math_divzero (1);
+      if (ix == asuint64 (INFINITY)) /* log(inf) == inf.  */
+	return x;
+      if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
+	return __math_invalid (x);
+      /* x is subnormal, normalize it.  */
+      ix = asuint64 (x * 0x1p52);
+      ix -= 52ULL << 52;
+    }
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (tmp >> (52 - LOG_TABLE_BITS)) % N;
+  k = (int64_t) tmp >> 52; /* arithmetic shift */
+  iz = ix - (tmp & 0xfffULL << 52);
+  invc = T[i].invc;
+  logc = T[i].logc;
+  z = asdouble (iz);
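+
+  /* Worked example (a sketch, assuming LOG_TABLE_BITS == 7, i.e. N == 128):
+     for x = 10.0, ix = 0x4024000000000000 and tmp = 0x003e000000000000,
+     giving k = 3, i = 112 and iz = 0x3ff4000000000000 = 1.25, i.e. the
+     decomposition 10.0 = 2^3 * 1.25 with z = 1.25 in [0.6875, 1.375).  */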
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  /* r ~= z/c - 1, |r| < 1/(2*N).  */
+#if HAVE_FAST_FMA
+  /* rounding error: 0x1p-55/N.  */
+  r = fma (z, invc, -1.0);
+#else
+  /* rounding error: 0x1p-55/N + 0x1p-66.  */
+  r = (z - T2[i].chi - T2[i].clo) * invc;
+#endif
+  kd = (double_t) k;
+
+  /* hi + lo = r + log(c) + k*Ln2.  */
+  w = kd * Ln2hi + logc;
+  hi = w + r;
+  lo = w - hi + r + kd * Ln2lo;
+
+  /* log(x) = lo + (log1p(r) - r) + hi.  */
+  r2 = r * r; /* rounding error: 0x1p-54/N^2.  */
+  /* Worst case error if |y| > 0x1p-5:
+     0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma)
+     Worst case error if |y| > 0x1p-4:
+     0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma).  */
+#if LOG_POLY_ORDER == 6
+  y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi;
+#elif LOG_POLY_ORDER == 7
+  y = lo
+      + r2
+	  * (A[0] + r * A[1] + r2 * (A[2] + r * A[3])
+	     + r2 * r2 * (A[4] + r * A[5]))
+      + hi;
+#endif
+  return eval_as_double (y);
+}
diff --git a/pl/math/log10_2u.c b/pl/math/log10_2u.c
new file mode 100644
index 0000000..74828ea
--- /dev/null
+++ b/pl/math/log10_2u.c
@@ -0,0 +1,150 @@
+/*
+ * Double-precision log10(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Polynomial coefficients and lookup tables.  */
+#define T __log10_data.tab
+#define T2 __log10_data.tab2
+#define B __log10_data.poly1
+#define A __log10_data.poly
+#define Ln2hi __log10_data.ln2hi
+#define Ln2lo __log10_data.ln2lo
+#define InvLn10 __log10_data.invln10
+#define N (1 << LOG10_TABLE_BITS)
+#define OFF 0x3fe6000000000000
+#define LO asuint64 (1.0 - 0x1p-4)
+#define HI asuint64 (1.0 + 0x1.09p-4)
+
+/* Top 16 bits of a double.  */
+static inline uint32_t
+top16 (double x)
+{
+  return asuint64 (x) >> 48;
+}
+
+/* Fast, low-accuracy implementation of log10.
+   The implementation is similar to that of math/log, except that:
+   - Polynomials are computed for log10(1+r), with r on the same intervals
+     as log.
+   - Lookup parameters are scaled (at runtime) to switch from base e to base 10.
+   Many errors above 1.59 ulp are observed across the whole range of doubles.
+   The greatest observed error is 1.61 ulp, at around 0.965:
+   log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6
+			      want -0x1.fee26884905a8p-6.  */
+double
+log10 (double x)
+{
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo;
+  uint64_t ix, iz, tmp;
+  uint32_t top;
+  int k, i;
+
+  ix = asuint64 (x);
+  top = top16 (x);
+
+  if (unlikely (ix - LO < HI - LO))
+    {
+      /* Handle close to 1.0 inputs separately.  */
+      /* Fix sign of zero with downward rounding when x==1.  */
+      if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0)))
+	return 0;
+      r = x - 1.0;
+      r2 = r * r;
+      r3 = r * r2;
+      y = r3
+	  * (B[1] + r * B[2] + r2 * B[3]
+	     + r3
+		 * (B[4] + r * B[5] + r2 * B[6]
+		    + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
+      /* Worst-case error is around 0.507 ULP.  */
+      w = r * 0x1p27;
+      double_t rhi = r + w - w;
+      double_t rlo = r - rhi;
+      w = rhi * rhi * B[0];
+      hi = r + w;
+      lo = r - hi + w;
+      lo += B[0] * rlo * (rhi + r);
+      y += lo;
+      y += hi;
+      /* Scale by 1/ln(10). Polynomial already contains scaling.  */
+      y = y * InvLn10;
+
+      return eval_as_double (y);
+    }
+  if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010))
+    {
+      /* x < 0x1p-1022 or inf or nan.  */
+      if (ix * 2 == 0)
+	return __math_divzero (1);
+      if (ix == asuint64 (INFINITY)) /* log10(inf) == inf.  */
+	return x;
+      if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
+	return __math_invalid (x);
+      /* x is subnormal, normalize it.  */
+      ix = asuint64 (x * 0x1p52);
+      ix -= 52ULL << 52;
+    }
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (tmp >> (52 - LOG10_TABLE_BITS)) % N;
+  k = (int64_t) tmp >> 52; /* arithmetic shift.  */
+  iz = ix - (tmp & 0xfffULL << 52);
+  invc = T[i].invc;
+  logc = T[i].logc;
+  z = asdouble (iz);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  /* r ~= z/c - 1, |r| < 1/(2*N).  */
+#if HAVE_FAST_FMA
+  /* rounding error: 0x1p-55/N.  */
+  r = fma (z, invc, -1.0);
+#else
+  /* rounding error: 0x1p-55/N + 0x1p-66.  */
+  r = (z - T2[i].chi - T2[i].clo) * invc;
+#endif
+  kd = (double_t) k;
+
+  /* w = log(c) + k*Ln2hi.  */
+  w = kd * Ln2hi + logc;
+  hi = w + r;
+  lo = w - hi + r + kd * Ln2lo;
+
+  /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)).  */
+  r2 = r * r; /* rounding error: 0x1p-54/N^2.  */
+
+  /* Scale by 1/ln(10). Polynomial already contains scaling.  */
+  y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi;
+  y = y * InvLn10;
+
+  return eval_as_double (y);
+}
+
+// clang-format off
+#if USE_GLIBC_ABI
+strong_alias (log10, __log10_finite)
+hidden_alias (log10, __ieee754_log10)
+#if LDBL_MANT_DIG == 53
+long double
+log10l (long double x)
+{
+  return log10 (x);
+}
+#endif
+#endif
+// clang-format on
+
+PL_SIG (S, D, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (log10, 1.11)
+PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000)
+PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000)
+PL_TEST_INTERVAL (log10, 0, inf, 40000)
diff --git a/pl/math/log10_data.c b/pl/math/log10_data.c
new file mode 100644
index 0000000..9976f19
--- /dev/null
+++ b/pl/math/log10_data.c
@@ -0,0 +1,337 @@
+/*
+ * Data for log10.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << LOG10_TABLE_BITS)
+
+const struct log10_data __log10_data = {
+.ln2hi = 0x1.62e42fefa3800p-1,
+.ln2lo = 0x1.ef35793c76730p-45,
+.invln10 = 0x1.bcb7b1526e50ep-2,
+.poly1 = {
+#if LOG10_POLY1_ORDER == 12
+// relative error: 0x1.c04d76cp-63
+// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval)
+-0x1p-1,
+0x1.5555555555577p-2,
+-0x1.ffffffffffdcbp-3,
+0x1.999999995dd0cp-3,
+-0x1.55555556745a7p-3,
+0x1.24924a344de3p-3,
+-0x1.fffffa4423d65p-4,
+0x1.c7184282ad6cap-4,
+-0x1.999eb43b068ffp-4,
+0x1.78182f7afd085p-4,
+-0x1.5521375d145cdp-4,
+#endif
+},
+.poly = {
+#if N == 128 && LOG10_POLY_ORDER == 6
+// relative error: 0x1.926199e8p-56
+// abs error: 0x1.882ff33p-65
+// in -0x1.fp-9 0x1.fp-9
+-0x1.0000000000001p-1,
+0x1.555555551305bp-2,
+-0x1.fffffffeb459p-3,
+0x1.999b324f10111p-3,
+-0x1.55575e506c89fp-3,
+#endif
+},
+/* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + log(z/c)
+	log(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p9 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-66 and
+	3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
+
+Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
+2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
+a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log(x)| < 0x1p-4, this is not enough so that is special cased.  */
+.tab = {
+#if N == 128
+{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2},
+{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2},
+{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2},
+{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2},
+{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2},
+{0x1.69147332f0cbap+0, -0x1.602d076180000p-2},
+{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2},
+{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2},
+{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2},
+{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2},
+{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2},
+{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2},
+{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2},
+{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2},
+{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2},
+{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2},
+{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2},
+{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2},
+{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2},
+{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2},
+{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2},
+{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2},
+{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2},
+{0x1.4880524d48434p+0, -0x1.feb224586f000p-3},
+{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3},
+{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3},
+{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3},
+{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3},
+{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3},
+{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3},
+{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3},
+{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3},
+{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3},
+{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3},
+{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3},
+{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3},
+{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3},
+{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3},
+{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3},
+{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3},
+{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3},
+{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3},
+{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3},
+{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3},
+{0x1.293726014b530p+0, -0x1.31b996b490000p-3},
+{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3},
+{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3},
+{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3},
+{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3},
+{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3},
+{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4},
+{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4},
+{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4},
+{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4},
+{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4},
+{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4},
+{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4},
+{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4},
+{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4},
+{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4},
+{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4},
+{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4},
+{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4},
+{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4},
+{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5},
+{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5},
+{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5},
+{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5},
+{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5},
+{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5},
+{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5},
+{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5},
+{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6},
+{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6},
+{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6},
+{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6},
+{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7},
+{0x1.02865137932a9p+0, -0x1.419355daa0000p-7},
+{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8},
+{0x1.008040614b195p+0, -0x1.0040979240000p-9},
+{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9},
+{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7},
+{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6},
+{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6},
+{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5},
+{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5},
+{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5},
+{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5},
+{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4},
+{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4},
+{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4},
+{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4},
+{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4},
+{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4},
+{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4},
+{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4},
+{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4},
+{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3},
+{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3},
+{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3},
+{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3},
+{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3},
+{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3},
+{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3},
+{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3},
+{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3},
+{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3},
+{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3},
+{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3},
+{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3},
+{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3},
+{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3},
+{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3},
+{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3},
+{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3},
+{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3},
+{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2},
+{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2},
+{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2},
+{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2},
+{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2},
+{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2},
+{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2},
+{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2},
+{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2},
+{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2},
+{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2},
+{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2},
+#endif
+},
+#if !HAVE_FAST_FMA
+.tab2 = {
+#if N == 128
+{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56},
+{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55},
+{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55},
+{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57},
+{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56},
+{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55},
+{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55},
+{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56},
+{0x1.710000e86978p-1, 0x1.bff6671097952p-56},
+{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55},
+{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57},
+{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57},
+{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55},
+{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56},
+{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55},
+{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55},
+{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55},
+{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55},
+{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55},
+{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55},
+{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55},
+{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56},
+{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55},
+{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55},
+{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55},
+{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56},
+{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55},
+{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56},
+{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55},
+{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55},
+{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60},
+{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55},
+{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56},
+{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55},
+{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55},
+{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55},
+{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55},
+{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57},
+{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55},
+{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57},
+{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58},
+{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56},
+{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56},
+{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55},
+{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56},
+{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57},
+{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57},
+{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55},
+{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55},
+{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57},
+{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55},
+{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55},
+{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56},
+{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57},
+{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55},
+{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55},
+{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56},
+{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55},
+{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58},
+{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56},
+{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56},
+{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55},
+{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55},
+{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57},
+{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56},
+{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56},
+{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56},
+{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58},
+{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55},
+{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56},
+{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58},
+{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55},
+{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59},
+{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55},
+{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55},
+{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57},
+{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56},
+{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57},
+{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56},
+{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57},
+{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55},
+{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54},
+{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54},
+{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55},
+{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57},
+{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54},
+{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55},
+{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56},
+{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55},
+{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54},
+{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54},
+{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55},
+{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54},
+{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54},
+{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57},
+{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54},
+{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54},
+{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54},
+{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56},
+{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56},
+{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56},
+{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54},
+{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55},
+{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55},
+{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55},
+{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54},
+{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54},
+{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55},
+{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54},
+{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55},
+{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56},
+{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54},
+{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57},
+{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55},
+{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55},
+{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54},
+{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54},
+{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54},
+{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54},
+{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54},
+{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57},
+{0x1.530001605277ap+0, -0x1.6bfcece233209p-54},
+{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55},
+{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54},
+{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55},
+{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54},
+{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54},
+{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54},
+#endif
+},
+#endif /* !HAVE_FAST_FMA */
+};
diff --git a/pl/math/log10f.c b/pl/math/log10f.c
new file mode 100644
index 0000000..5c80008
--- /dev/null
+++ b/pl/math/log10f.c
@@ -0,0 +1,97 @@
+/*
+ * Single-precision log10 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+#include <stdint.h>
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Data associated with logf:
+
+   LOGF_TABLE_BITS = 4
+   LOGF_POLY_ORDER = 4
+
+   ULP error: 0.818 (nearest rounding).
+   Relative error: 1.957 * 2^-26 (before rounding).  */
+
+#define T __logf_data.tab
+#define A __logf_data.poly
+#define Ln2 __logf_data.ln2
+#define InvLn10 __logf_data.invln10
+#define N (1 << LOGF_TABLE_BITS)
+#define OFF 0x3f330000
+
+/* This naive implementation of log10f mimics that of log and then simply
+   scales the result by 1/log(10) to switch from base e to base 10. Hence,
+   most computations are carried out in double precision. Scaling before
+   rounding to single precision is both faster and more accurate.
+
+   ULP error: 0.797 ulp (nearest rounding).  */
+float
+log10f (float x)
+{
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t z, r, r2, y, y0, invc, logc;
+  uint32_t ix, iz, tmp;
+  int k, i;
+
+  ix = asuint (x);
+#if WANT_ROUNDING
+  /* Fix sign of zero with downward rounding when x==1.  */
+  if (unlikely (ix == 0x3f800000))
+    return 0;
+#endif
+  if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000))
+    {
+      /* x < 0x1p-126 or inf or nan.  */
+      if (ix * 2 == 0)
+	return __math_divzerof (1);
+      if (ix == 0x7f800000) /* log(inf) == inf.  */
+	return x;
+      if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
+	return __math_invalidf (x);
+      /* x is subnormal, normalize it.  */
+      ix = asuint (x * 0x1p23f);
+      ix -= 23 << 23;
+    }
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
+  k = (int32_t) tmp >> 23; /* arithmetic shift.  */
+  iz = ix - (tmp & 0xff800000);
+  invc = T[i].invc;
+  logc = T[i].logc;
+  z = (double_t) asfloat (iz);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = z * invc - 1;
+  y0 = logc + (double_t) k * Ln2;
+
+  /* Pipelined polynomial evaluation to approximate log1p(r).  */
+  r2 = r * r;
+  y = A[1] * r + A[2];
+  y = A[0] * r2 + y;
+  y = y * r2 + (y0 + r);
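+  /* i.e. the value computed is y0 + r + r2 * (A[2] + A[1]*r + A[0]*r2).  */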
+
+  /* Multiply by 1/log(10).  */
+  y = y * InvLn10;
+
+  return eval_as_float (y);
+}
+
+PL_SIG (S, F, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (log10f, 0.30)
+PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000)
+PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000)
+PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000)
+PL_TEST_INTERVAL (log10f, 0, inf, 50000)
diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
new file mode 100644
index 0000000..23c8ed4
--- /dev/null
+++ b/pl/math/log1p_2u.c
@@ -0,0 +1,136 @@
+/*
+ * Double-precision log(1+x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "estrin.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Ln2Hi 0x1.62e42fefa3800p-1
+#define Ln2Lo 0x1.ef35793c76730p-45
+#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)).  */
+#define OneMHfRt2Top                                                           \
+  0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)).  */
+#define OneTop12 0x3ff
+#define BottomMask 0xffffffff
+#define OneMHfRt2 0x3fd2bec333018866
+#define Rt2MOne 0x3fda827999fcef32
+#define AbsMask 0x7fffffffffffffff
+#define ExpM63 0x3c00
+#define C(i) __log1p_data.coeffs[i]
+
+static inline double
+eval_poly (double f)
+{
+  double f2 = f * f;
+  double f4 = f2 * f2;
+  double f8 = f4 * f4;
+  return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
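+
+/* A note on eval_poly above: ESTRIN_18 (from estrin.h) evaluates the
+   degree-18 polynomial in C[0..18] using Estrin's scheme, combining short
+   sub-polynomials via the precomputed powers f2, f4, f8 and f8 * f8 so that
+   independent fma chains can run in parallel.  As a degree-3 sketch:
+     (C[0] + C[1]*f) + f2 * (C[2] + C[3]*f).  */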
+
+/* log1p approximation using polynomial on reduced interval. Largest
+   observed errors are near the lower boundary of the region where k
+   is 0.
+   Maximum measured error: 1.75 ULP.
+   log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2
+			       want -0x1.65fb8659a2f92p-2.  */
+double
+log1p (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  uint32_t ia16 = ia >> 48;
+
+  /* Handle special cases first.  */
+  if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000
+		|| ix == 0x8000000000000000))
+    {
+      if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000)
+	{
+	  /* x ==  -0 => log1p(x) =  -0.
+	     x == Inf => log1p(x) = Inf.  */
+	  return x;
+	}
+      if (ix == 0xbff0000000000000)
+	{
+	  /* x == -1 => log1p(x) = -Inf.  */
+	  return __math_divzero (-1);
+	}
+      if (ia16 >= 0x7ff0)
+	{
+	  /* x == +/-NaN => log1p(x) = NaN.  */
+	  return __math_invalid (asdouble (ia));
+	}
+      /* x  <      -1 => log1p(x) =  NaN.
+	 x ==    -Inf => log1p(x) =  NaN.  */
+      return __math_invalid (x);
+    }
+
+  /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
+			   is in [sqrt(2)/2, sqrt(2)]):
+     log1p(x) = k*log(2) + log1p(f).
+
+     f may not be representable exactly, so we need a correction term:
+     let m = round(1 + x), c = (1 + x) - m.
+     c << m: at very small x, log1p(x) ~ x, hence:
+     log(1+x) - log(m) ~ c/m.
+
+     We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m.  */
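+
+  /* To see why the correction works: writing 1 + x = m + c exactly, with
+     m = round(1 + x), gives
+       log(1 + x) = log(m * (1 + c/m)) = log(m) + log1p(c/m) ~= log(m) + c/m,
+     since |c| is at most half an ulp of m and c/m is therefore tiny.  */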
+
+  uint64_t sign = ix & ~AbsMask;
+  if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne))
+    {
+      if (unlikely (ia16 <= ExpM63))
+	{
+	  /* If exponent of x <= -63 then shortcut the polynomial and avoid
+	     underflow by just returning x, which is exactly rounded in this
+	     region.  */
+	  return x;
+	}
+      /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the
+	 logic below, as k = 0 and f = x and therefore representable exactly.
+	 All we need is to return the polynomial.  */
+      return fma (x, eval_poly (x) * x, x);
+    }
+
+  /* Obtain correctly scaled k by manipulation in the exponent.  */
+  double m = x + 1;
+  uint64_t mi = asuint64 (m);
+  uint32_t u = (mi >> 32) + OneMHfRt2Top;
+  int32_t k = (int32_t) (u >> 20) - OneTop12;
+
+  /* Correction term c/m.  */
+  double cm = (x - (m - 1)) / m;
+
+  /* Reduce x to f in [sqrt(2)/2, sqrt(2)].  */
+  uint32_t utop = (u & 0x000fffff) + HfRt2Top;
+  uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask);
+  double f = asdouble (u_red) - 1;
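+
+  /* Worked example: for x = 3.0, m = 4.0 and mi = 0x4010000000000000, so
+     u = 0x40195f62 and k = 0x401 - 0x3ff = 2; the correction cm is 0, and
+     utop = 0x3ff00000 reduces m to 1.0, i.e. f = 0.  The result is then
+     2 * (Ln2Hi + Ln2Lo) ~= log(4), as expected.  */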
+
+  /* Approximate log1p(x) on the reduced input using a polynomial. Because
+     log1p(0)=0 we choose an approximation of the form:
+	x + C0*x^2 + C1*x^3 + C2*x^4 + ...
+     Hence the approximation has the form f + f^2 * P(f)
+	where P(x) = C0 + C1*x + C2*x^2 + ...  */
+  double p = fma (f, eval_poly (f) * f, f);
+
+  double kd = k;
+  double y = fma (Ln2Lo, kd, cm);
+  return y + fma (Ln2Hi, kd, p);
+}
+
+PL_SIG (S, D, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (log1p, 1.26)
+PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000)
+PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
+PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
+PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000)
+PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000)
+PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000)
+PL_TEST_INTERVAL (log1p, -1.0, inf, 5000)
diff --git a/pl/math/log1p_data.c b/pl/math/log1p_data.c
new file mode 100644
index 0000000..6168a0c
--- /dev/null
+++ b/pl/math/log1p_data.c
@@ -0,0 +1,19 @@
+/*
+ * Data used in double-precision log(1+x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients generated using Remez algorithm, see
+   log1p.sollya for details.  */
+const struct log1p_data __log1p_data = {
+  .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+	     0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+	     -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+	     0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+	     -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+	     0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+	     -0x1.cfa7385bdb37ep-6}};
diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c
new file mode 100644
index 0000000..fcfd05a
--- /dev/null
+++ b/pl/math/log1pf_2u1.c
@@ -0,0 +1,165 @@
+/*
+ * Single-precision log(1+x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "hornerf.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Ln2 (0x1.62e43p-1f)
+#define SignMask (0x80000000)
+
+/* Biased exponent of the largest float m for which m^8 underflows.  */
+#define M8UFLOW_BOUND_BEXP 112
+/* Biased exponent of the largest float for which we just return x.  */
+#define TINY_BOUND_BEXP 103
+
+#define C(i) __log1pf_data.coeffs[i]
+
+static inline float
+eval_poly (float m, uint32_t e)
+{
+#ifdef LOG1PF_2U5
+
+  /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using
+     slightly modified Estrin scheme (no x^0 term, and x term is just x).  */
+  float p_12 = fmaf (m, C (1), C (0));
+  float p_34 = fmaf (m, C (3), C (2));
+  float p_56 = fmaf (m, C (5), C (4));
+  float p_78 = fmaf (m, C (7), C (6));
+
+  float m2 = m * m;
+  float p_02 = fmaf (m2, p_12, m);
+  float p_36 = fmaf (m2, p_56, p_34);
+  float p_79 = fmaf (m2, C (8), p_78);
+
+  float m4 = m2 * m2;
+  float p_06 = fmaf (m4, p_36, p_02);
+
+  if (unlikely (e < M8UFLOW_BOUND_BEXP))
+    return p_06;
+
+  float m8 = m4 * m4;
+  return fmaf (m8, p_79, p_06);
+
+#elif defined(LOG1PF_1U3)
+
+  /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner
+     scheme. Our polynomial approximation for log1p has the form
+     x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ...
+     Hence the approximation has the form m + m^2 * P(m)
+       where P(x) = C1 + C2 * x + C3 * x^2 + ... .  */
+  return fmaf (m, m * HORNER_8 (m, C), m);
+
+#else
+#error No log1pf approximation exists with the requested precision. Options are 13 or 25.
+#endif
+}
+
+static inline uint32_t
+biased_exponent (uint32_t ix)
+{
+  return (ix & 0x7f800000) >> 23;
+}
+
+/* log1pf approximation using polynomial on reduced interval. Worst-case error
+   when using Estrin is roughly 2.02 ULP:
+   log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3.  */
+float
+log1pf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & ~SignMask;
+  uint32_t ia12 = ia >> 20;
+  uint32_t e = biased_exponent (ix);
+
+  /* Handle special cases first.  */
+  if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000
+		|| e <= TINY_BOUND_BEXP))
+    {
+      if (ix == 0xff800000)
+	{
+	  /* x == -Inf => log1pf(x) =  NaN.  */
+	  return NAN;
+	}
+      if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8)
+	{
+	  /* |x| < TinyBound => log1p(x)  =  x.
+	      x ==       Inf => log1pf(x) = Inf.  */
+	  return x;
+	}
+      if (ix == 0xbf800000)
+	{
+	  /* x == -1.0 => log1pf(x) = -Inf.  */
+	  return __math_divzerof (-1);
+	}
+      if (ia12 >= 0x7f8)
+	{
+	  /* x == +/-NaN => log1pf(x) = NaN.  */
+	  return __math_invalidf (asfloat (ia));
+	}
+      /* x <    -1.0 => log1pf(x) = NaN.  */
+      return __math_invalidf (x);
+    }
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+			   is in [-0.25, 0.5]):
+     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*k*log(2) to ensure the scale is representable
+     as a normalised fp32 number.  */
+
+  if (ix <= 0x3f000000 || ia <= 0x3e800000)
+    {
+      /* If x is in [-0.25, 0.5] then we can shortcut all the logic
+	 below, as k = 0 and m = x.  All we need is to return the
+	 polynomial.  */
+      return eval_poly (x, e);
+    }
+
+  float m = x + 1.0f;
+
+  /* k is used to scale the input. 0x3f400000 is chosen as we are trying to
+     reduce x to the range [-0.25, 0.5]. Inside this range, k is 0.
+     Outside this range, if the bits of k are reinterpreted as (not
+     converted to) a float:
+	 let k = sign * 2^p      where sign = -1 if x < 0
+					       1 otherwise
+	 and p is a negative integer whose magnitude increases with the
+	 magnitude of x.  */
+  int k = (asuint (m) - 0x3f400000) & 0xff800000;
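+
+  /* Worked example: for x = 3.0f, m = 4.0f, so
+     k = (0x40800000 - 0x3f400000) & 0xff800000 = 0x01000000, i.e. two
+     exponent steps (scale_back below is 0x1p24 * 0x1p-23 = 2.0f).  m_scale
+     then reduces to 0.0f and the result is 2 * Ln2 ~= log(4).  */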
+
+  /* By using integer arithmetic, we obtain the necessary scaling by
+     subtracting the unbiased exponent of k from the exponent of x.  */
+  float m_scale = asfloat (asuint (x) - k);
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number (s in [2^-126, 2^26]), and scale m down accordingly.  */
+  float s = asfloat (asuint (4.0f) - k);
+  m_scale = m_scale + fmaf (0.25f, s, -1.0f);
+
+  float p = eval_poly (m_scale, biased_exponent (asuint (m_scale)));
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k.  */
+  float scale_back = (float) k * 0x1.0p-23f;
+
+  /* Apply the scaling back.  */
+  return fmaf (scale_back, Ln2, p);
+}
+
+PL_SIG (S, F, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (log1pf, 1.52)
+PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000)
+PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
+PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
+PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000)
+PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000)
+PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000)
+PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000)
diff --git a/pl/math/log1pf_data.c b/pl/math/log1pf_data.c
new file mode 100644
index 0000000..8c92d57
--- /dev/null
+++ b/pl/math/log1pf_data.c
@@ -0,0 +1,14 @@
+/*
+ * Data used in single-precision log1p(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+/* Polynomial coefficients generated using floating-point minimax
+   algorithm, see tools/log1pf.sollya for details.  */
+const struct log1pf_data __log1pf_data
+  = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+		-0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
+		-0x1.6f0d5ep-5f}};
diff --git a/pl/math/log_data.c b/pl/math/log_data.c
new file mode 100644
index 0000000..34715e5
--- /dev/null
+++ b/pl/math/log_data.c
@@ -0,0 +1,511 @@
+/*
+ * Data for log.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << LOG_TABLE_BITS)
+
+const struct log_data __log_data = {
+.ln2hi = 0x1.62e42fefa3800p-1,
+.ln2lo = 0x1.ef35793c76730p-45,
+.poly1 = {
+#if LOG_POLY1_ORDER == 10
+// relative error: 0x1.32eccc6p-62
+// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
+-0x1p-1,
+0x1.55555555554e5p-2,
+-0x1.0000000000af2p-2,
+0x1.9999999bbe436p-3,
+-0x1.55555537f9cdep-3,
+0x1.24922fc8127cfp-3,
+-0x1.0000b7d6bb612p-3,
+0x1.c806ee1ddbcafp-4,
+-0x1.972335a9c2d6ep-4,
+#elif LOG_POLY1_ORDER == 11
+// relative error: 0x1.52c8b708p-68
+// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
+-0x1p-1,
+0x1.5555555555555p-2,
+-0x1.ffffffffffea9p-3,
+0x1.999999999c4d4p-3,
+-0x1.55555557f5541p-3,
+0x1.249248fbe33e4p-3,
+-0x1.ffffc9a3c825bp-4,
+0x1.c71e1f204435dp-4,
+-0x1.9a7f26377d06ep-4,
+0x1.71c30cf8f7364p-4,
+#elif LOG_POLY1_ORDER == 12
+// relative error: 0x1.c04d76cp-63
+// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval)
+-0x1p-1,
+0x1.5555555555577p-2,
+-0x1.ffffffffffdcbp-3,
+0x1.999999995dd0cp-3,
+-0x1.55555556745a7p-3,
+0x1.24924a344de3p-3,
+-0x1.fffffa4423d65p-4,
+0x1.c7184282ad6cap-4,
+-0x1.999eb43b068ffp-4,
+0x1.78182f7afd085p-4,
+-0x1.5521375d145cdp-4,
+#endif
+},
+.poly = {
+#if N == 64 && LOG_POLY_ORDER == 7
+// relative error: 0x1.906eb8ap-58
+// abs error: 0x1.d2cad5a8p-67
+// in -0x1.fp-8 0x1.fp-8
+-0x1.0000000000027p-1,
+0x1.555555555556ap-2,
+-0x1.fffffff0440bap-3,
+0x1.99999991906c3p-3,
+-0x1.555c8d7e8201ep-3,
+0x1.24978c59151fap-3,
+#elif N == 128 && LOG_POLY_ORDER == 6
+// relative error: 0x1.926199e8p-56
+// abs error: 0x1.882ff33p-65
+// in -0x1.fp-9 0x1.fp-9
+-0x1.0000000000001p-1,
+0x1.555555551305bp-2,
+-0x1.fffffffeb459p-3,
+0x1.999b324f10111p-3,
+-0x1.55575e506c89fp-3,
+#elif N == 128 && LOG_POLY_ORDER == 7
+// relative error: 0x1.649fc4bp-64
+// abs error: 0x1.c3b5769p-74
+// in -0x1.fp-9 0x1.fp-9
+-0x1.0000000000001p-1,
+0x1.5555555555556p-2,
+-0x1.fffffffea1a8p-3,
+0x1.99999998e9139p-3,
+-0x1.555776801b968p-3,
+0x1.2493c29331a5cp-3,
+#endif
+},
+/* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + log(z/c)
+	log(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p9 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-66 and
+	3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
+
+Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
+2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
+a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log(x)| < 0x1p-4, this is not enough so that is special cased.  */
+.tab = {
+#if N == 64
+{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2},
+{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2},
+{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2},
+{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2},
+{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2},
+{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2},
+{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2},
+{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2},
+{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2},
+{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2},
+{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2},
+{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2},
+{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3},
+{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3},
+{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3},
+{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3},
+{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3},
+{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3},
+{0x1.33ae463091760p+0, -0x1.7898db878d000p-3},
+{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3},
+{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3},
+{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3},
+{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3},
+{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3},
+{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3},
+{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4},
+{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4},
+{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4},
+{0x1.194538e960658p+0, -0x1.8197efba9a000p-4},
+{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4},
+{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4},
+{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4},
+{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5},
+{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5},
+{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5},
+{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5},
+{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6},
+{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6},
+{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7},
+{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8},
+{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8},
+{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6},
+{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5},
+{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5},
+{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4},
+{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4},
+{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4},
+{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4},
+{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4},
+{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3},
+{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3},
+{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3},
+{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3},
+{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3},
+{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3},
+{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3},
+{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3},
+{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3},
+{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2},
+{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2},
+{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2},
+{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2},
+{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2},
+{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2},
+#elif N == 128
+{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2},
+{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2},
+{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2},
+{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2},
+{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2},
+{0x1.69147332f0cbap+0, -0x1.602d076180000p-2},
+{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2},
+{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2},
+{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2},
+{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2},
+{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2},
+{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2},
+{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2},
+{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2},
+{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2},
+{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2},
+{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2},
+{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2},
+{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2},
+{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2},
+{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2},
+{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2},
+{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2},
+{0x1.4880524d48434p+0, -0x1.feb224586f000p-3},
+{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3},
+{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3},
+{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3},
+{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3},
+{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3},
+{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3},
+{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3},
+{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3},
+{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3},
+{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3},
+{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3},
+{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3},
+{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3},
+{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3},
+{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3},
+{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3},
+{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3},
+{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3},
+{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3},
+{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3},
+{0x1.293726014b530p+0, -0x1.31b996b490000p-3},
+{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3},
+{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3},
+{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3},
+{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3},
+{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3},
+{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4},
+{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4},
+{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4},
+{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4},
+{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4},
+{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4},
+{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4},
+{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4},
+{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4},
+{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4},
+{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4},
+{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4},
+{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4},
+{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4},
+{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5},
+{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5},
+{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5},
+{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5},
+{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5},
+{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5},
+{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5},
+{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5},
+{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6},
+{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6},
+{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6},
+{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6},
+{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7},
+{0x1.02865137932a9p+0, -0x1.419355daa0000p-7},
+{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8},
+{0x1.008040614b195p+0, -0x1.0040979240000p-9},
+{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9},
+{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7},
+{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6},
+{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6},
+{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5},
+{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5},
+{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5},
+{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5},
+{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4},
+{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4},
+{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4},
+{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4},
+{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4},
+{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4},
+{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4},
+{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4},
+{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4},
+{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3},
+{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3},
+{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3},
+{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3},
+{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3},
+{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3},
+{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3},
+{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3},
+{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3},
+{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3},
+{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3},
+{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3},
+{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3},
+{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3},
+{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3},
+{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3},
+{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3},
+{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3},
+{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3},
+{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2},
+{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2},
+{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2},
+{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2},
+{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2},
+{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2},
+{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2},
+{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2},
+{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2},
+{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2},
+{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2},
+{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2},
+#endif
+},
+#if !HAVE_FAST_FMA
+.tab2 = {
+#if N == 64
+{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56},
+{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55},
+{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55},
+{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56},
+{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55},
+{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57},
+{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56},
+{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56},
+{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57},
+{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56},
+{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57},
+{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55},
+{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55},
+{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55},
+{0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56},
+{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58},
+{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55},
+{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55},
+{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57},
+{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56},
+{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57},
+{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57},
+{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56},
+{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55},
+{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58},
+{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55},
+{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58},
+{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59},
+{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59},
+{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58},
+{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55},
+{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55},
+{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55},
+{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60},
+{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55},
+{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55},
+{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56},
+{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57},
+{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58},
+{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58},
+{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54},
+{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55},
+{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54},
+{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54},
+{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55},
+{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57},
+{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54},
+{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55},
+{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54},
+{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55},
+{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56},
+{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54},
+{0x1.320000324c55bp+0, 0x1.f81983997354fp-54},
+{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54},
+{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54},
+{0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56},
+{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54},
+{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55},
+{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55},
+{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56},
+{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54},
+{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55},
+{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55},
+{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54},
+#elif N == 128
+{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56},
+{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55},
+{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55},
+{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57},
+{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56},
+{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55},
+{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55},
+{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56},
+{0x1.710000e86978p-1, 0x1.bff6671097952p-56},
+{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55},
+{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57},
+{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57},
+{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55},
+{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56},
+{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55},
+{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55},
+{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55},
+{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55},
+{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55},
+{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55},
+{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55},
+{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56},
+{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55},
+{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55},
+{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55},
+{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56},
+{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55},
+{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56},
+{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55},
+{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55},
+{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60},
+{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55},
+{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56},
+{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55},
+{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55},
+{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55},
+{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55},
+{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57},
+{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55},
+{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57},
+{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58},
+{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56},
+{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56},
+{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55},
+{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56},
+{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57},
+{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57},
+{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55},
+{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55},
+{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57},
+{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55},
+{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55},
+{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56},
+{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57},
+{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55},
+{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55},
+{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56},
+{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55},
+{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58},
+{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56},
+{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56},
+{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55},
+{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55},
+{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57},
+{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56},
+{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56},
+{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56},
+{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58},
+{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55},
+{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56},
+{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58},
+{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55},
+{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59},
+{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55},
+{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55},
+{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57},
+{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56},
+{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57},
+{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56},
+{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57},
+{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55},
+{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54},
+{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54},
+{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55},
+{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57},
+{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54},
+{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55},
+{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56},
+{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55},
+{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54},
+{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54},
+{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55},
+{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54},
+{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54},
+{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57},
+{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54},
+{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54},
+{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54},
+{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56},
+{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56},
+{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56},
+{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54},
+{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55},
+{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55},
+{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55},
+{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54},
+{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54},
+{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55},
+{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54},
+{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55},
+{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56},
+{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54},
+{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57},
+{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55},
+{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55},
+{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54},
+{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54},
+{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54},
+{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54},
+{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54},
+{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57},
+{0x1.530001605277ap+0, -0x1.6bfcece233209p-54},
+{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55},
+{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54},
+{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55},
+{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54},
+{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54},
+{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54},
+#endif
+},
+#endif /* !HAVE_FAST_FMA */
+};
diff --git a/pl/math/logf.c b/pl/math/logf.c
new file mode 100644
index 0000000..17a74ed
--- /dev/null
+++ b/pl/math/logf.c
@@ -0,0 +1,75 @@
+/*
+ * Single-precision log function.
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "math_config.h"
+
+/*
+LOGF_TABLE_BITS = 4
+LOGF_POLY_ORDER = 4
+
+ULP error: 0.818 (nearest rounding.)
+Relative error: 1.957 * 2^-26 (before rounding.)
+*/
+
+#define T __logf_data.tab
+#define A __logf_data.poly
+#define Ln2 __logf_data.ln2
+#define N (1 << LOGF_TABLE_BITS)
+#define OFF 0x3f330000
+
+float
+optr_aor_log_f32 (float x)
+{
+  /* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */
+  double_t z, r, r2, y, y0, invc, logc;
+  uint32_t ix, iz, tmp;
+  int k, i;
+
+  ix = asuint (x);
+#if WANT_ROUNDING
+  /* Fix sign of zero with downward rounding when x==1.  */
+  if (unlikely (ix == 0x3f800000))
+    return 0;
+#endif
+  if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000))
+    {
+      /* x < 0x1p-126 or inf or nan.  */
+      if (ix * 2 == 0)
+	return __math_divzerof (1);
+      if (ix == 0x7f800000) /* log(inf) == inf.  */
+	return x;
+      if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
+	return __math_invalidf (x);
+      /* x is subnormal, normalize it.  */
+      ix = asuint (x * 0x1p23f);
+      ix -= 23 << 23;
+    }
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
+  k = (int32_t) tmp >> 23; /* arithmetic shift */
+  iz = ix - (tmp & 0x1ff << 23);
+  invc = T[i].invc;
+  logc = T[i].logc;
+  z = (double_t) asfloat (iz);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */
+  r = z * invc - 1;
+  y0 = logc + (double_t) k * Ln2;
+
+  /* Pipelined polynomial evaluation to approximate log1p(r).  */
+  r2 = r * r;
+  y = A[1] * r + A[2];
+  y = A[0] * r2 + y;
+  y = y * r2 + (y0 + r);
+  return eval_as_float (y);
+}
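The index/exponent split above is the heart of the table method: subtracting
OFF aligns z into [OFF, 2*OFF] so that the top LOGF_TABLE_BITS bits of tmp
select the subinterval and an arithmetic shift by 23 recovers k.  A minimal
standalone sketch, with OFF and LOGF_TABLE_BITS mirrored from the routine
above:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  float x = 3.5f;
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);          /* asuint (x), without the helper */
  uint32_t tmp = ix - 0x3f330000;       /* OFF */
  int i = (tmp >> (23 - 4)) % (1 << 4); /* subinterval, LOGF_TABLE_BITS = 4 */
  int k = (int32_t) tmp >> 23;          /* x = 2^k * z */
  printf ("i = %d, k = %d\n", i, k);    /* i = 5, k = 2 for x = 3.5 */
  return 0;
}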
diff --git a/pl/math/logf_data.c b/pl/math/logf_data.c
new file mode 100644
index 0000000..97d9eb8
--- /dev/null
+++ b/pl/math/logf_data.c
@@ -0,0 +1,36 @@
+/*
+ * Data definition for logf and log10f.
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct logf_data __logf_data = {
+    .tab =
+        {
+            {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2},
+            {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2},
+            {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2},
+            {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3},
+            {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3},
+            {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3},
+            {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4},
+            {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4},
+            {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5},
+            {0x1p+0, 0x0p+0},
+            {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5},
+            {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4},
+            {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3},
+            {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3},
+            {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2},
+            {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2},
+        },
+    .ln2 = 0x1.62e42fefa39efp-1,
+    .invln10 = 0x1.bcb7b1526e50ep-2,
+    .poly = {
+        -0x1.00ea348b88334p-2,
+        0x1.5575b0be00b6ap-2,
+        -0x1.ffffef20a4123p-2,
+    }};
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
new file mode 100644
index 0000000..dccb3ce
--- /dev/null
+++ b/pl/math/math_config.h
@@ -0,0 +1,572 @@
+/*
+ * Configuration for math routines.
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _MATH_CONFIG_H
+#define _MATH_CONFIG_H
+
+#include <math.h>
+#include <stdint.h>
+
+#ifndef WANT_ROUNDING
+/* If defined to 1, return correct results for special cases in non-nearest
+   rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
+   This may be set to 0 if there is no fenv support or if math functions only
+   get called in round to nearest mode.  */
+# define WANT_ROUNDING 1
+#endif
+#ifndef WANT_ERRNO
+/* If defined to 1, set errno in math functions according to ISO C.  Many math
+   libraries do not set errno, so this is 0 by default.  It may need to be
+   set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.  */
+# define WANT_ERRNO 0
+#endif
+#ifndef WANT_SIMD_EXCEPT
+/* If defined to 1, trigger fp exceptions in vector routines, consistent with
+   the behaviour expected from the corresponding scalar routine.  */
+# define WANT_SIMD_EXCEPT 0
+#endif
+
+/* Compiler can inline round as a single instruction.  */
+#ifndef HAVE_FAST_ROUND
+# if __aarch64__
+#   define HAVE_FAST_ROUND 1
+# else
+#   define HAVE_FAST_ROUND 0
+# endif
+#endif
+
+/* Compiler can inline lround, but not (long)round(x).  */
+#ifndef HAVE_FAST_LROUND
+# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__
+#   define HAVE_FAST_LROUND 1
+# else
+#   define HAVE_FAST_LROUND 0
+# endif
+#endif
+
+/* Compiler can inline fma as a single instruction.  */
+#ifndef HAVE_FAST_FMA
+# if defined FP_FAST_FMA || __aarch64__
+#   define HAVE_FAST_FMA 1
+# else
+#   define HAVE_FAST_FMA 0
+# endif
+#endif
+
+/* Provide *_finite symbols and some of the glibc hidden symbols
+   so libmathlib can be used with binaries compiled against glibc
+   to interpose math functions with both static and dynamic linking.  */
+#ifndef USE_GLIBC_ABI
+# if __GNUC__
+#   define USE_GLIBC_ABI 1
+# else
+#   define USE_GLIBC_ABI 0
+# endif
+#endif
+
+/* Optionally used extensions.  */
+#ifdef __GNUC__
+# define HIDDEN __attribute__ ((__visibility__ ("hidden")))
+# define NOINLINE __attribute__ ((noinline))
+# define UNUSED __attribute__ ((unused))
+# define likely(x) __builtin_expect (!!(x), 1)
+# define unlikely(x) __builtin_expect (x, 0)
+# if __GNUC__ >= 9
+#   define attribute_copy(f) __attribute__ ((copy (f)))
+# else
+#   define attribute_copy(f)
+# endif
+# define strong_alias(f, a) \
+  extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f);
+# define hidden_alias(f, a) \
+  extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \
+  attribute_copy (f);
+#else
+# define HIDDEN
+# define NOINLINE
+# define UNUSED
+# define likely(x) (x)
+# define unlikely(x) (x)
+#endif
+
+#if HAVE_FAST_ROUND
+/* When set, the roundtoint and converttoint functions are provided with
+   the semantics documented below.  */
+# define TOINT_INTRINSICS 1
+
+/* Round x to nearest int in all rounding modes; ties have to be rounded
+   consistently with converttoint so the results match.  If the result
+   would be outside of [-2^31, 2^31-1] then the semantics is unspecified.  */
+static inline double_t
+roundtoint (double_t x)
+{
+  return round (x);
+}
+
+/* Convert x to nearest int in all rounding modes; ties have to be rounded
+   consistently with roundtoint.  If the result is not representable in an
+   int32_t then the semantics is unspecified.  */
+static inline int32_t
+converttoint (double_t x)
+{
+# if HAVE_FAST_LROUND
+  return lround (x);
+# else
+  return (long) round (x);
+# endif
+}
+#endif
+
+static inline uint32_t
+asuint (float f)
+{
+  union
+  {
+    float f;
+    uint32_t i;
+  } u = {f};
+  return u.i;
+}
+
+static inline float
+asfloat (uint32_t i)
+{
+  union
+  {
+    uint32_t i;
+    float f;
+  } u = {i};
+  return u.f;
+}
+
+static inline uint64_t
+asuint64 (double f)
+{
+  union
+  {
+    double f;
+    uint64_t i;
+  } u = {f};
+  return u.i;
+}
+
+static inline double
+asdouble (uint64_t i)
+{
+  union
+  {
+    uint64_t i;
+    double f;
+  } u = {i};
+  return u.f;
+}
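+
+/* For example, a sign flip can be done entirely on the bit view; a minimal
+   sketch (not a helper provided by this header):
+
+     float negate (float x) { return asfloat (asuint (x) ^ 0x80000000); }
+
+   The unions make the conversions well-defined, unlike pointer-based type
+   punning.  */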
+
+#ifndef IEEE_754_2008_SNAN
+# define IEEE_754_2008_SNAN 1
+#endif
+static inline int
+issignalingf_inline (float x)
+{
+  uint32_t ix = asuint (x);
+  if (!IEEE_754_2008_SNAN)
+    return (ix & 0x7fc00000) == 0x7fc00000;
+  return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000;
+}
+
+static inline int
+issignaling_inline (double x)
+{
+  uint64_t ix = asuint64 (x);
+  if (!IEEE_754_2008_SNAN)
+    return (ix & 0x7ff8000000000000) == 0x7ff8000000000000;
+  return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL;
+}
+
+#if __aarch64__ && __GNUC__
+/* Prevent the optimization of a floating-point expression.  */
+static inline float
+opt_barrier_float (float x)
+{
+  __asm__ __volatile__ ("" : "+w" (x));
+  return x;
+}
+static inline double
+opt_barrier_double (double x)
+{
+  __asm__ __volatile__ ("" : "+w" (x));
+  return x;
+}
+/* Force the evaluation of a floating-point expression for its side-effect.  */
+static inline void
+force_eval_float (float x)
+{
+  __asm__ __volatile__ ("" : "+w" (x));
+}
+static inline void
+force_eval_double (double x)
+{
+  __asm__ __volatile__ ("" : "+w" (x));
+}
+#else
+static inline float
+opt_barrier_float (float x)
+{
+  volatile float y = x;
+  return y;
+}
+static inline double
+opt_barrier_double (double x)
+{
+  volatile double y = x;
+  return y;
+}
+static inline void
+force_eval_float (float x)
+{
+  volatile float y UNUSED = x;
+}
+static inline void
+force_eval_double (double x)
+{
+  volatile double y UNUSED = x;
+}
+#endif
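+
+/* A minimal usage sketch: without a barrier the compiler may delete a
+   computation whose only purpose is its fp side-effect.  For instance
+
+     float tiny = 0x1p-126f;
+     force_eval_float (tiny * tiny);
+
+   keeps the underflowing multiply (raising underflow and inexact), and the
+   opt_barrier_* variants also return the value, which the error handling in
+   math_err.c and math_errf.c relies on.  */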
+
+/* Evaluate an expression as the specified type.  Normally a type
+   cast should be enough, but compilers implement non-standard
+   excess-precision handling, so when FLT_EVAL_METHOD != 0 these
+   functions may need to be customized.  */
+static inline float
+eval_as_float (float x)
+{
+  return x;
+}
+static inline double
+eval_as_double (double x)
+{
+  return x;
+}
+
+/* Error handling tail calls for special cases, with a sign argument.
+   The sign of the return value is set if the argument is non-zero.  */
+
+/* The result overflows.  */
+HIDDEN float __math_oflowf (uint32_t);
+/* The result underflows to 0 in nearest rounding mode.  */
+HIDDEN float __math_uflowf (uint32_t);
+/* The result underflows to 0 in some directed rounding mode only.  */
+HIDDEN float __math_may_uflowf (uint32_t);
+/* Division by zero.  */
+HIDDEN float __math_divzerof (uint32_t);
+/* The result overflows.  */
+HIDDEN double __math_oflow (uint32_t);
+/* The result underflows to 0 in nearest rounding mode.  */
+HIDDEN double __math_uflow (uint32_t);
+/* The result underflows to 0 in some directed rounding mode only.  */
+HIDDEN double __math_may_uflow (uint32_t);
+/* Division by zero.  */
+HIDDEN double __math_divzero (uint32_t);
+
+/* Error handling using input checking.  */
+
+/* Invalid input unless it is a quiet NaN.  */
+HIDDEN float __math_invalidf (float);
+/* Invalid input unless it is a quiet NaN.  */
+HIDDEN double __math_invalid (double);
+
+/* Error handling using output checking, only for errno setting.  */
+
+/* Check if the result overflowed to infinity.  */
+HIDDEN double __math_check_oflow (double);
+/* Check if the result underflowed to 0.  */
+HIDDEN double __math_check_uflow (double);
+
+/* Check if the result overflowed to infinity.  */
+static inline double
+check_oflow (double x)
+{
+  return WANT_ERRNO ? __math_check_oflow (x) : x;
+}
+
+/* Check if the result underflowed to 0.  */
+static inline double
+check_uflow (double x)
+{
+  return WANT_ERRNO ? __math_check_uflow (x) : x;
+}
+
+/* Check if the result overflowed to infinity.  */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0.  */
+HIDDEN float __math_check_uflowf (float);
+
+/* Check if the result overflowed to infinity.  */
+static inline float
+check_oflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Check if the result underflowed to 0.  */
+static inline float
+check_uflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_uflowf (x) : x;
+}
+
+extern const struct erff_data
+{
+  float erff_poly_A[6];
+  float erff_poly_B[7];
+} __erff_data HIDDEN;
+
+/* Data for logf and log10f.  */
+#define LOGF_TABLE_BITS 4
+#define LOGF_POLY_ORDER 4
+extern const struct logf_data
+{
+  struct
+  {
+    double invc, logc;
+  } tab[1 << LOGF_TABLE_BITS];
+  double ln2;
+  double invln10;
+  double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1.  */
+} __logf_data HIDDEN;
+
+/* Data for low accuracy log10 (with 1/ln(10) included in coefficients).  */
+#define LOG10_TABLE_BITS 7
+#define LOG10_POLY_ORDER 6
+#define LOG10_POLY1_ORDER 12
+extern const struct log10_data
+{
+  double ln2hi;
+  double ln2lo;
+  double invln10;
+  double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10).  */
+  double poly1[LOG10_POLY1_ORDER - 1];
+  struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS];
+#if !HAVE_FAST_FMA
+  struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS];
+#endif
+} __log10_data HIDDEN;
+
+#define EXP_TABLE_BITS 7
+#define EXP_POLY_ORDER 5
+/* Use a polynomial that is optimized for a wider input range.  This may be
+   needed for good precision in non-nearest rounding and !TOINT_INTRINSICS.  */
+#define EXP_POLY_WIDE 0
+/* Use close to nearest rounding toint when !TOINT_INTRINSICS.  This may be
+   needed for good precision in non-nearest rounding and !EXP_POLY_WIDE.  */
+#define EXP_USE_TOINT_NARROW 0
+#define EXP2_POLY_ORDER 5
+#define EXP2_POLY_WIDE 0
+extern const struct exp_data
+{
+  double invln2N;
+  double shift;
+  double negln2hiN;
+  double negln2loN;
+  double poly[4]; /* Last four coefficients.  */
+  double exp2_shift;
+  double exp2_poly[EXP2_POLY_ORDER];
+  uint64_t tab[2*(1 << EXP_TABLE_BITS)];
+} __exp_data HIDDEN;
+
+#define ERFC_NUM_INTERVALS 20
+#define ERFC_POLY_ORDER 12
+extern const struct erfc_data
+{
+  double interval_bounds[ERFC_NUM_INTERVALS + 1];
+  double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1];
+} __erfc_data HIDDEN;
+extern const struct v_erfc_data
+{
+  double interval_bounds[ERFC_NUM_INTERVALS + 1];
+  double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1];
+} __v_erfc_data HIDDEN;
+
+#define ERFCF_POLY_NCOEFFS 16
+extern const struct erfcf_poly_data
+{
+  double poly[4][ERFCF_POLY_NCOEFFS];
+} __erfcf_poly_data HIDDEN;
+
+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN;
+
+#define V_ERF_NINTS 49
+#define V_ERF_NCOEFFS 10
+extern const struct v_erf_data
+{
+  double shifts[V_ERF_NINTS];
+  double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS];
+} __v_erf_data HIDDEN;
+
+#define V_ERFF_NCOEFFS 7
+extern const struct v_erff_data
+{
+  float coeffs[V_ERFF_NCOEFFS][2];
+} __v_erff_data HIDDEN;
+
+#define ATAN_POLY_NCOEFFS 20
+extern const struct atan_poly_data
+{
+  double poly[ATAN_POLY_NCOEFFS];
+} __atan_poly_data HIDDEN;
+
+#define ATANF_POLY_NCOEFFS 8
+extern const struct atanf_poly_data
+{
+  float poly[ATANF_POLY_NCOEFFS];
+} __atanf_poly_data HIDDEN;
+
+#define ASINHF_NCOEFFS 8
+extern const struct asinhf_data
+{
+  float coeffs[ASINHF_NCOEFFS];
+} __asinhf_data HIDDEN;
+
+#define LOG_TABLE_BITS 7
+#define LOG_POLY_ORDER 6
+#define LOG_POLY1_ORDER 12
+extern const struct log_data
+{
+  double ln2hi;
+  double ln2lo;
+  double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1.  */
+  double poly1[LOG_POLY1_ORDER - 1];
+  struct
+  {
+    double invc, logc;
+  } tab[1 << LOG_TABLE_BITS];
+#if !HAVE_FAST_FMA
+  struct
+  {
+    double chi, clo;
+  } tab2[1 << LOG_TABLE_BITS];
+#endif
+} __log_data HIDDEN;
+
+#define ASINH_NCOEFFS 18
+extern const struct asinh_data
+{
+  double poly[ASINH_NCOEFFS];
+} __asinh_data HIDDEN;
+
+#define LOG1P_NCOEFFS 19
+extern const struct log1p_data
+{
+  double coeffs[LOG1P_NCOEFFS];
+} __log1p_data HIDDEN;
+
+#define LOG1PF_2U5
+#define V_LOG1PF_2U5
+#define LOG1PF_NCOEFFS 9
+extern const struct log1pf_data
+{
+  float coeffs[LOG1PF_NCOEFFS];
+} __log1pf_data HIDDEN;
+
+#define TANF_P_POLY_NCOEFFS 6
+/* The cotan approach needs order 3 on [0, pi/4] to reach < 3.5 ULP.  */
+#define TANF_Q_POLY_NCOEFFS 4
+extern const struct tanf_poly_data
+{
+  float poly_tan[TANF_P_POLY_NCOEFFS];
+  float poly_cotan[TANF_Q_POLY_NCOEFFS];
+} __tanf_poly_data HIDDEN;
+
+#define V_LOG2F_POLY_NCOEFFS 9
+extern const struct v_log2f_data
+{
+  float poly[V_LOG2F_POLY_NCOEFFS];
+} __v_log2f_data HIDDEN;
+
+#define V_LOG2_TABLE_BITS 7
+#define V_LOG2_POLY_ORDER 6
+extern const struct v_log2_data
+{
+  double poly[V_LOG2_POLY_ORDER - 1];
+  struct
+  {
+    double invc, log2c;
+  } tab[1 << V_LOG2_TABLE_BITS];
+} __v_log2_data HIDDEN;
+
+#define V_SINF_NCOEFFS 4
+extern const struct sv_sinf_data
+{
+  float coeffs[V_SINF_NCOEFFS];
+} __sv_sinf_data HIDDEN;
+
+#define V_LOG10_TABLE_BITS 7
+#define V_LOG10_POLY_ORDER 6
+extern const struct v_log10_data
+{
+  struct
+  {
+    double invc, log10c;
+  } tab[1 << V_LOG10_TABLE_BITS];
+  double poly[V_LOG10_POLY_ORDER - 1];
+  double invln10, log10_2;
+} __v_log10_data HIDDEN;
+
+#define V_LOG10F_POLY_ORDER 9
+extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN;
+
+#define SV_LOGF_POLY_ORDER 8
+extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN;
+
+#define SV_LOG_POLY_ORDER 6
+#define SV_LOG_TABLE_BITS 7
+extern const struct sv_log_data
+{
+  double invc[1 << SV_LOG_TABLE_BITS];
+  double logc[1 << SV_LOG_TABLE_BITS];
+  double poly[SV_LOG_POLY_ORDER - 1];
+} __sv_log_data HIDDEN;
+
+#ifndef SV_EXPF_USE_FEXPA
+#define SV_EXPF_USE_FEXPA 0
+#endif
+#define SV_EXPF_POLY_ORDER 6
+extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN;
+
+#define EXPM1F_POLY_ORDER 5
+extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN;
+
+#define EXPF_TABLE_BITS 5
+#define EXPF_POLY_ORDER 3
+extern const struct expf_data
+{
+  uint64_t tab[1 << EXPF_TABLE_BITS];
+  double invln2_scaled;
+  double poly_scaled[EXPF_POLY_ORDER];
+} __expf_data HIDDEN;
+
+#define EXPM1_POLY_ORDER 11
+extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN;
+
+extern const struct cbrtf_data
+{
+  float poly[4];
+  float table[5];
+} __cbrtf_data HIDDEN;
+
+extern const struct cbrt_data
+{
+  double poly[4];
+  double table[5];
+} __cbrt_data HIDDEN;
+
+extern const struct v_tan_data
+{
+  double neg_half_pi_hi, neg_half_pi_lo;
+  double poly[9];
+} __v_tan_data HIDDEN;
+#endif
diff --git a/pl/math/math_err.c b/pl/math/math_err.c
new file mode 100644
index 0000000..d246a89
--- /dev/null
+++ b/pl/math/math_err.c
@@ -0,0 +1,78 @@
+/*
+ * Double-precision math error handling.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#if WANT_ERRNO
+#include <errno.h>
+/* NOINLINE reduces code size and avoids making math functions non-leaf
+   when the error handling is inlined.  */
+NOINLINE static double
+with_errno (double y, int e)
+{
+  errno = e;
+  return y;
+}
+#else
+#define with_errno(x, e) (x)
+#endif
+
+/* NOINLINE reduces code size.  */
+NOINLINE static double
+xflow (uint32_t sign, double y)
+{
+  y = eval_as_double (opt_barrier_double (sign ? -y : y) * y);
+  return with_errno (y, ERANGE);
+}
+
+HIDDEN double
+__math_uflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p-767);
+}
+
+/* Underflows to zero in some non-nearest rounding mode; setting errno is
+   valid even if the result is non-zero but in the subnormal range.  */
+HIDDEN double
+__math_may_uflow (uint32_t sign)
+{
+  return xflow (sign, 0x1.8p-538);
+}
+
+HIDDEN double
+__math_oflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p769);
+}
+
+HIDDEN double
+__math_divzero (uint32_t sign)
+{
+  double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0;
+  return with_errno (y, ERANGE);
+}
+
+HIDDEN double
+__math_invalid (double x)
+{
+  double y = (x - x) / (x - x);
+  return isnan (x) ? y : with_errno (y, EDOM);
+}
+
+/* Check result and set errno if necessary.  */
+
+HIDDEN double
+__math_check_uflow (double y)
+{
+  return y == 0.0 ? with_errno (y, ERANGE) : y;
+}
+
+HIDDEN double
+__math_check_oflow (double y)
+{
+  return isinf (y) ? with_errno (y, ERANGE) : y;
+}
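A note on how xflow forces the signal: 0x1p-767 is chosen so that its square
(0x1p-1534) underflows, and 0x1p769 so that its square overflows, with the
opt_barrier call stopping the compiler from folding the multiply away.  A
minimal sketch of the underflow case, assuming round-to-nearest:

#include <stdio.h>

int
main (void)
{
  double y = 0x1p-767;
  double r = -y * y;    /* the sign != 0 case: underflows to -0.0 */
  printf ("%g\n", r);   /* prints -0, raising underflow and inexact */
  return 0;
}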
diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c
new file mode 100644
index 0000000..96271ff
--- /dev/null
+++ b/pl/math/math_errf.c
@@ -0,0 +1,78 @@
+/*
+ * Single-precision math error handling.
+ *
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#if WANT_ERRNO
+#include <errno.h>
+/* NOINLINE reduces code size and avoids making math functions non-leaf
+   when the error handling is inlined.  */
+NOINLINE static float
+with_errnof (float y, int e)
+{
+  errno = e;
+  return y;
+}
+#else
+#define with_errnof(x, e) (x)
+#endif
+
+/* NOINLINE reduces code size.  */
+NOINLINE static float
+xflowf (uint32_t sign, float y)
+{
+  y = eval_as_float (opt_barrier_float (sign ? -y : y) * y);
+  return with_errnof (y, ERANGE);
+}
+
+HIDDEN float
+__math_uflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p-95f);
+}
+
+/* Underflows to zero in some non-nearest rounding mode; setting errno is
+   valid even if the result is non-zero but in the subnormal range.  */
+HIDDEN float
+__math_may_uflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1.4p-75f);
+}
+
+HIDDEN float
+__math_oflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p97f);
+}
+
+HIDDEN float
+__math_divzerof (uint32_t sign)
+{
+  float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f;
+  return with_errnof (y, ERANGE);
+}
+
+HIDDEN float
+__math_invalidf (float x)
+{
+  float y = (x - x) / (x - x);
+  return isnan (x) ? y : with_errnof (y, EDOM);
+}
+
+/* Check result and set errno if necessary.  */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+  return y == 0.0f ? with_errnof (y, ERANGE) : y;
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+  return isinf (y) ? with_errnof (y, ERANGE) : y;
+}
diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h
new file mode 100644
index 0000000..6ad98dc
--- /dev/null
+++ b/pl/math/pairwise_horner.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for double-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f64
+#else
+#define FMA fma
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h
new file mode 100644
index 0000000..e56f059
--- /dev/null
+++ b/pl/math/pairwise_horner_wrap.h
@@ -0,0 +1,48 @@
+/*
+ * Helper macros for pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+// clang-format off
+#define  PW_HORNER_1_(x, c,     i) FMA(x,  c(i + 1),                       c(i))
+#define  PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x,     c, i + 2), PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define  PAIRWISE_HORNER_1(x,     c) PW_HORNER_1_ (x, c, 0)
+#define  PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0)
+#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0)
+#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0)
+#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0)
+
+#define  PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2),                       PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define  PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
+
+#define  PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0)
+#define  PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_ (x, x2, c, 0)
+#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0)
+#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0)
+#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0)
+#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0)
+#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0)
+// clang-format on
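Each level pairs adjacent coefficients with one fma, then combines the pairs
with powers of x2, roughly halving the dependency chain of plain Horner.  As
a concrete instance, PAIRWISE_HORNER_3 (x, x2, c) expands to
FMA (x2, FMA (x, c (3), c (2)), FMA (x, c (1), c (0))).  A minimal
self-contained sketch with a hypothetical coefficient accessor C:

#include <math.h>

static const double coeffs[4] = { 1.0, 1.0, 0.5, 1.0 / 6 }; /* illustrative */
#define C(i) coeffs[i]
#define FMA fma
#define PW_HORNER_1_(x, c, i) FMA (x, c (i + 1), c (i))
#define PW_HORNER_3_(x, x2, c, i)                                             \
  FMA (x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_ (x, c, i))

static double
poly3 (double x)
{
  /* coeffs[0] + coeffs[1] x + coeffs[2] x^2 + coeffs[3] x^3.  */
  return PW_HORNER_3_ (x, x * x, C, 0);
}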
diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h
new file mode 100644
index 0000000..784750c
--- /dev/null
+++ b/pl/math/pairwise_hornerf.h
@@ -0,0 +1,14 @@
+/*
+ * Helper macros for single-precision pairwise Horner polynomial evaluation.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#if V_SUPPORTED
+#define FMA v_fma_f32
+#else
+#define FMA fmaf
+#endif
+
+#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h
new file mode 100644
index 0000000..686d24f
--- /dev/null
+++ b/pl/math/pl_sig.h
@@ -0,0 +1,43 @@
+/*
+ * PL macros for emitting various ulp/bench entries based on function signature
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define PL_DECL_SF1(fun) float fun##f (float);
+#define PL_DECL_SF2(fun) float fun##f (float, float);
+#define PL_DECL_SD1(fun) double fun (double);
+#define PL_DECL_SD2(fun) double fun (double, double);
+
+#if V_SUPPORTED
+#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t);
+#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t);
+#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t);
+#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t);
+#else
+#define PL_DECL_VF1(fun)
+#define PL_DECL_VF2(fun)
+#define PL_DECL_VD1(fun)
+#define PL_DECL_VD2(fun)
+#endif
+
+#if SV_SUPPORTED
+#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t);
+#define PL_DECL_SVF2(fun)                                                      \
+  sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t);
+#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t);
+#define PL_DECL_SVD2(fun)                                                      \
+  sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t);
+#else
+#define PL_DECL_SVF1(fun)
+#define PL_DECL_SVF2(fun)
+#define PL_DECL_SVD1(fun)
+#define PL_DECL_SVD2(fun)
+#endif
+
+/* For building the routines, emit function prototype from PL_SIG. This
+   ensures that the correct signature has been chosen (wrong one will be a
+   compile error). PL_SIG is defined differently by various components of the
+   build system to emit entries in the wrappers and entries for mathbench and
+   ulp.  */
+#define PL_SIG(v, t, a, f, ...) PL_DECL_##v##t##a (f)
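For instance, the scalar double-precision sinh routine below declares

  PL_SIG (S, D, 1, sinh, -10.0, 10.0)

which under this definition expands to PL_DECL_SD1 (sinh), i.e. the prototype

  double sinh (double);

so a routine defined with a mismatched signature fails to compile; the extra
arguments (the benchmark range) are only consumed by the build system's
alternative PL_SIG definitions.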
diff --git a/pl/math/s_acosh_3u5.c b/pl/math/s_acosh_3u5.c
new file mode 100644
index 0000000..f62cbd6
--- /dev/null
+++ b/pl/math/s_acosh_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_acosh_3u5.c"
diff --git a/pl/math/s_acoshf_3u1.c b/pl/math/s_acoshf_3u1.c
new file mode 100644
index 0000000..3740666
--- /dev/null
+++ b/pl/math/s_acoshf_3u1.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_acoshf_3u1.c"
diff --git a/pl/math/s_asinh_3u5.c b/pl/math/s_asinh_3u5.c
new file mode 100644
index 0000000..ab8fbd9
--- /dev/null
+++ b/pl/math/s_asinh_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_asinh_3u5.c"
diff --git a/pl/math/s_asinhf_2u7.c b/pl/math/s_asinhf_2u7.c
new file mode 100644
index 0000000..13e1a5f
--- /dev/null
+++ b/pl/math/s_asinhf_2u7.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_asinhf_2u7.c"
diff --git a/pl/math/s_atan2_3u.c b/pl/math/s_atan2_3u.c
new file mode 100644
index 0000000..4603e5f
--- /dev/null
+++ b/pl/math/s_atan2_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atan2_3u.c"
diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c
new file mode 100644
index 0000000..894d843
--- /dev/null
+++ b/pl/math/s_atan2f_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atan2f_3u.c"
diff --git a/pl/math/s_atan_2u5.c b/pl/math/s_atan_2u5.c
new file mode 100644
index 0000000..4b61bc4
--- /dev/null
+++ b/pl/math/s_atan_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atan_2u5.c"
diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c
new file mode 100644
index 0000000..6b65719
--- /dev/null
+++ b/pl/math/s_atanf_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanf_3u.c"
diff --git a/pl/math/s_atanh_3u5.c b/pl/math/s_atanh_3u5.c
new file mode 100644
index 0000000..f6a5f75
--- /dev/null
+++ b/pl/math/s_atanh_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanh_3u5.c"
diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c
new file mode 100644
index 0000000..e7e5c61
--- /dev/null
+++ b/pl/math/s_atanhf_3u1.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanhf_3u1.c"
diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c
new file mode 100644
index 0000000..435e74a
--- /dev/null
+++ b/pl/math/s_cbrt_2u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrt_2u.c"
diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c
new file mode 100644
index 0000000..5c79370
--- /dev/null
+++ b/pl/math/s_cbrtf_1u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cbrtf_1u5.c"
diff --git a/pl/math/s_cosh_2u.c b/pl/math/s_cosh_2u.c
new file mode 100644
index 0000000..cdf352c
--- /dev/null
+++ b/pl/math/s_cosh_2u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_cosh_2u.c"
diff --git a/pl/math/s_coshf_2u4.c b/pl/math/s_coshf_2u4.c
new file mode 100644
index 0000000..8f7d5da
--- /dev/null
+++ b/pl/math/s_coshf_2u4.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_coshf_2u4.c"
diff --git a/pl/math/s_erf_2u.c b/pl/math/s_erf_2u.c
new file mode 100644
index 0000000..839535c
--- /dev/null
+++ b/pl/math/s_erf_2u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_erf_2u.c"
diff --git a/pl/math/s_erfc_4u.c b/pl/math/s_erfc_4u.c
new file mode 100644
index 0000000..bf9e3e6
--- /dev/null
+++ b/pl/math/s_erfc_4u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_erfc_4u.c"
diff --git a/pl/math/s_erfcf_1u.c b/pl/math/s_erfcf_1u.c
new file mode 100644
index 0000000..024d224
--- /dev/null
+++ b/pl/math/s_erfcf_1u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_erfcf_1u.c"
diff --git a/pl/math/s_erff_1u5.c b/pl/math/s_erff_1u5.c
new file mode 100644
index 0000000..a5b9bf9
--- /dev/null
+++ b/pl/math/s_erff_1u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_erff_1u5.c"
diff --git a/pl/math/s_exp_tail.c b/pl/math/s_exp_tail.c
new file mode 100644
index 0000000..20b1b41
--- /dev/null
+++ b/pl/math/s_exp_tail.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_exp_tail.c"
diff --git a/pl/math/s_expf.c b/pl/math/s_expf.c
new file mode 100644
index 0000000..557a2e3
--- /dev/null
+++ b/pl/math/s_expf.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_expf.c"
diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c
new file mode 100644
index 0000000..da2d6e7
--- /dev/null
+++ b/pl/math/s_expm1_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_expm1_2u5.c"
diff --git a/pl/math/s_expm1f_1u6.c b/pl/math/s_expm1f_1u6.c
new file mode 100644
index 0000000..eea8089
--- /dev/null
+++ b/pl/math/s_expm1f_1u6.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_expm1f_1u6.c"
diff --git a/pl/math/s_log10_2u5.c b/pl/math/s_log10_2u5.c
new file mode 100644
index 0000000..2480e5a
--- /dev/null
+++ b/pl/math/s_log10_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log10_2u5.c"
diff --git a/pl/math/s_log10f_3u5.c b/pl/math/s_log10f_3u5.c
new file mode 100644
index 0000000..173e0fd
--- /dev/null
+++ b/pl/math/s_log10f_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log10f_3u5.c"
diff --git a/pl/math/s_log1p_2u5.c b/pl/math/s_log1p_2u5.c
new file mode 100644
index 0000000..20b395a
--- /dev/null
+++ b/pl/math/s_log1p_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log1p_2u5.c"
diff --git a/pl/math/s_log1pf_2u1.c b/pl/math/s_log1pf_2u1.c
new file mode 100644
index 0000000..013ec4c
--- /dev/null
+++ b/pl/math/s_log1pf_2u1.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log1pf_2u1.c"
diff --git a/pl/math/s_log2_3u.c b/pl/math/s_log2_3u.c
new file mode 100644
index 0000000..d46f3f9
--- /dev/null
+++ b/pl/math/s_log2_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log2_3u.c"
diff --git a/pl/math/s_log2f_2u5.c b/pl/math/s_log2f_2u5.c
new file mode 100644
index 0000000..e76c67d
--- /dev/null
+++ b/pl/math/s_log2f_2u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_log2f_2u5.c"
diff --git a/pl/math/s_sinh_3u.c b/pl/math/s_sinh_3u.c
new file mode 100644
index 0000000..27e5e65
--- /dev/null
+++ b/pl/math/s_sinh_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_sinh_3u.c"
diff --git a/pl/math/s_sinhf_2u3.c b/pl/math/s_sinhf_2u3.c
new file mode 100644
index 0000000..607f942
--- /dev/null
+++ b/pl/math/s_sinhf_2u3.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_sinhf_2u3.c"
diff --git a/pl/math/s_tan_3u5.c b/pl/math/s_tan_3u5.c
new file mode 100644
index 0000000..adb807c
--- /dev/null
+++ b/pl/math/s_tan_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tan_3u5.c"
diff --git a/pl/math/s_tanf_3u5.c b/pl/math/s_tanf_3u5.c
new file mode 100644
index 0000000..fa64c8a
--- /dev/null
+++ b/pl/math/s_tanf_3u5.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tanf_3u5.c"
diff --git a/pl/math/s_tanh_3u.c b/pl/math/s_tanh_3u.c
new file mode 100644
index 0000000..a4d7bce
--- /dev/null
+++ b/pl/math/s_tanh_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tanh_3u.c"
diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c
new file mode 100644
index 0000000..896fc62
--- /dev/null
+++ b/pl/math/s_tanhf_2u6.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_tanhf_2u6.c"
diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c
new file mode 100644
index 0000000..f534815
--- /dev/null
+++ b/pl/math/sinh_3u.c
@@ -0,0 +1,66 @@
+/*
+ * Double-precision sinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define OFlowBound                                                             \
+  0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results   \
+			in NaN.  */
+
+double
+__exp_dd (double, double);
+
+/* Approximation for double-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The greatest observed error is 2.57 ULP:
+   sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+                             want 0x1.ab34e59d678d9p-2.  */
+double
+sinh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iax = ix & AbsMask;
+  double ax = asdouble (iax);
+  uint64_t sign = ix & ~AbsMask;
+  double halfsign = asdouble (Half | sign);
+
+  if (unlikely (iax >= OFlowBound))
+    {
+      /* Special values and overflow.  */
+      if (unlikely (iax > 0x7ff0000000000000))
+	return __math_invalid (x);
+      /* expm1 overflows a little before sinh. We have to fill this
+	 gap by using a different algorithm, in this case we use a
+	 double-precision exp helper. For large x sinh(x) is dominated
+	 by exp(x), however we cannot compute exp without overflow
+	 either. We use the identity: exp(a) = (exp(a / 2)) ^ 2
+	 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2    for x > 0
+			    ~= (exp(|x| / 2)) ^ 2 / -2   for x < 0.  */
+      double e = __exp_dd (ax / 2, 0);
+      return (e * halfsign) * e;
+    }
+
+  /* Use expm1 to retain acceptable precision for small numbers.
+     Let t = e^(|x|) - 1.  */
+  double t = expm1 (ax);
+  /* Then sinh(x) = (t + t / (t + 1)) / 2   for x > 0
+		    (t + t / (t + 1)) / -2  for x < 0.  */
+  return (t + t / (t + 1)) * halfsign;
+}
+
+PL_SIG (S, D, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (sinh, 2.08)
+PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100)
+PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100)
+PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000)
+PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
+PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000)
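Two details worth spelling out.  The expm1 path relies on the identity
e^a - e^-a = t + t/(t + 1) with t = e^a - 1, which avoids the catastrophic
cancellation of computing exp(x) - exp(-x) directly for small x.  Near the
overflow bound the (e * halfsign) * e ordering is what keeps the intermediate
finite; a minimal sketch using libm exp:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double ax = 710.0;              /* just below OFlowBound */
  double e = exp (ax / 2);        /* about 2^512 */
  printf ("%g\n", e * e);         /* inf: the intermediate overflows */
  printf ("%g\n", (e * 0.5) * e); /* about 1.12e308: finite */
  return 0;
}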
diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c
new file mode 100644
index 0000000..de94428
--- /dev/null
+++ b/pl/math/sinhf_2u3.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision sinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define Expm1OFlowLimit                                                        \
+  0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f          \
+		overflows.  */
+#define OFlowLimit                                                             \
+  0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should   \
+		overflow.  */
+
+float
+optr_aor_exp_f32 (float);
+
+/* Approximation for single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4.  */
+float
+sinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  float ax = asfloat (iax);
+  uint32_t sign = ix & ~AbsMask;
+  float halfsign = asfloat (Half | sign);
+
+  if (unlikely (iax >= Expm1OFlowLimit))
+    {
+      /* Special values and overflow.  */
+      if (iax >= 0x7fc00001 || iax == 0x7f800000)
+	return x;
+      if (iax >= 0x7f800000)
+	return __math_invalidf (x);
+      if (iax >= OFlowLimit)
+	return __math_oflowf (sign);
+
+      /* expm1f overflows a little before sinhf (~88.7 vs ~89.4). We have to
+	 fill this gap by using a different algorithm, in this case we use a
+	 double-precision exp helper. For large x sinh(x) is dominated by exp(x),
+	 however we cannot compute exp without overflow either. We use the
+	 identity:
+	 exp(a) = (exp(a / 2)) ^ 2.
+	 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2    for x > 0
+			    ~= (exp(|x| / 2)) ^ 2 / -2   for x < 0.
+	 Greatest error in this region is 1.89 ULP:
+	 sinhf(0x1.65898cp+6) got 0x1.f00aep+127  want 0x1.f00adcp+127.  */
+      float e = optr_aor_exp_f32 (ax / 2);
+      return (e * halfsign) * e;
+    }
+
+  /* Use expm1f to retain acceptable precision for small numbers.
+     Let t = e^(|x|) - 1.  */
+  float t = expm1f (ax);
+  /* Then sinh(x) = (t + t / (t + 1)) / 2   for x > 0
+		    (t + t / (t + 1)) / -2  for x < 0.  */
+  return (t + t / (t + 1)) * halfsign;
+}
+
+PL_SIG (S, F, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (sinhf, 1.76)
+PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
+PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000)
+PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
+PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100)
+PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
+PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100)
diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c
new file mode 100644
index 0000000..a4bea1d
--- /dev/null
+++ b/pl/math/sv_atan2_2u5.c
@@ -0,0 +1,93 @@
+/*
+ * Double-precision vector atan2(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#include "sv_atan_common.h"
+
+/* Useful constants.  */
+#define PiOver2 sv_f64 (0x1.921fb54442d18p+0)
+#define SignMask sv_u64 (0x8000000000000000)
+
+/* Special cases, i.e. 0, infinity and NaN (fall back to scalar calls).  */
+__attribute__ ((noinline)) static sv_f64_t
+specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp)
+{
+  return sv_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns a predicate indicating true if the input is the bit representation of
+   0, infinity or nan.  */
+static inline svbool_t
+zeroinfnan (sv_u64_t i, const svbool_t pg)
+{
+  return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1),
+		      sv_u64 (2 * asuint64 (INFINITY) - 1));
+}
+
+/* Fast implementation of SVE atan2. Errors are greatest when y and
+   x are reasonably close together. The greatest observed error is 2.28 ULP:
+   sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
+   got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1.  */
+sv_f64_t
+__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg)
+{
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t iy = sv_as_u64_f64 (y);
+
+  svbool_t cmp_x = zeroinfnan (ix, pg);
+  svbool_t cmp_y = zeroinfnan (iy, pg);
+  svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y);
+
+  sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask);
+  sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask);
+  sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y);
+
+  sv_f64_t ax = svabs_f64_x (pg, x);
+  sv_f64_t ay = svabs_f64_x (pg, y);
+
+  svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0));
+  svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax);
+
+  /* Set up z for call to atan.  */
+  sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay);
+  sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax);
+  sv_f64_t z = svdiv_f64_x (pg, n, d);
+
+  /* Work out the correct shift.  */
+  sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
+  shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift);
+  shift = svmul_f64_x (pg, shift, PiOver2);
+
+  sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift);
+
+  /* Account for the sign of x and y.  */
+  ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy));
+
+  if (unlikely (svptest_any (pg, cmp_xy)))
+    {
+      return specialcase (y, x, ret, cmp_xy);
+    }
+
+  return ret;
+}
+
+PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2)
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (SV, D, 2, atan2)
+PL_TEST_ULP (__sv_atan2, 1.78)
+PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000)
+#endif
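The zeroinfnan test works purely on bit patterns: doubling the word shifts
out the sign bit, and the unsigned subtraction of 1 wraps 0 around to
UINT64_MAX, so exactly the encodings of 0, infinity and NaN end up at or
above 2 * asuint64 (INFINITY) - 1.  A scalar model of one lane:

#include <stdint.h>

static int
zeroinfnan_scalar (uint64_t i)
{
  /* i is the bit pattern of a double; 0x7ff0000000000000 is +inf.  */
  return 2 * i - 1 >= 2 * 0x7ff0000000000000ULL - 1;
}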
diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c
new file mode 100644
index 0000000..f7674c4
--- /dev/null
+++ b/pl/math/sv_atan2f_3u.c
@@ -0,0 +1,94 @@
+/*
+ * Single-precision vector atan2f(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#include "sv_atanf_common.h"
+
+/* Useful constants.  */
+#define PiOver2 sv_f32 (0x1.921fb6p+0f)
+#define SignMask sv_u32 (0x80000000)
+
+/* Special cases, i.e. 0, infinity and NaN (fall back to scalar calls).  */
+static inline sv_f32_t
+specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp)
+{
+  return sv_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns a predicate indicating true if the input is the bit representation of
+   0, infinity or nan.  */
+static inline svbool_t
+zeroinfnan (sv_u32_t i, const svbool_t pg)
+{
+  return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1),
+		      sv_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2)
+   with reduction to [0,1] using z=1/x and shift = pi/2.
+   Maximum observed error is 2.95 ULP:
+   __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+					   want 0x1.967f00p-1.  */
+sv_f32_t
+__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  sv_u32_t iy = sv_as_u32_f32 (y);
+
+  svbool_t cmp_x = zeroinfnan (ix, pg);
+  svbool_t cmp_y = zeroinfnan (iy, pg);
+  svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y);
+
+  sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask);
+  sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask);
+  sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y);
+
+  sv_f32_t ax = svabs_f32_x (pg, x);
+  sv_f32_t ay = svabs_f32_x (pg, y);
+
+  svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0));
+  svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax);
+
+  /* Set up z for call to atan.  */
+  sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay);
+  sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax);
+  sv_f32_t z = svdiv_f32_x (pg, n, d);
+
+  /* Work out the correct shift.  */
+  sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
+  shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift);
+  shift = svmul_f32_x (pg, shift, PiOver2);
+
+  sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift);
+
+  /* Account for the sign of x and y.  */
+  ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy));
+
+  if (unlikely (svptest_any (pg, cmp_xy)))
+    {
+      return specialcase (y, x, ret, cmp_xy);
+    }
+
+  return ret;
+}
+
+PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f)
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (SV, F, 2, atan2)
+PL_TEST_ULP (__sv_atan2f, 2.45)
+PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000)
+#endif
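The shift logic above encodes the quadrant: it starts at -2 (for -pi) when
x < 0, gains +1 (for pi/2) when |y| > |x|, and the final eor with sign_xy
restores the sign of the result.  A scalar model of the reduction, with libm
atan standing in for the polynomial (ay and ax are the absolute values):

#include <math.h>

static double
atan2_reduced (double ay, double ax, int x_is_neg)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double shift = x_is_neg ? -2.0 : 0.0;
  double z;
  if (ay > ax)
    {
      shift += 1.0;     /* pi/2 + atan (-ax / ay) == atan (ay / ax) */
      z = -ax / ay;
    }
  else
    z = ay / ax;
  /* The caller applies the sign of x * y, as the eor with sign_xy does.  */
  return shift * pi_over_2 + atan (z);
}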
diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c
new file mode 100644
index 0000000..02ac331
--- /dev/null
+++ b/pl/math/sv_atan_2u5.c
@@ -0,0 +1,62 @@
+/*
+ * Double-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#include "sv_atan_common.h"
+
+/* Useful constants.  */
+#define PiOver2 sv_f64 (0x1.921fb54442d18p+0)
+#define AbsMask (0x7fffffffffffffff)
+
+/* Fast implementation of SVE atan.
+   Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed
+   error is 2.27 ulps:
+   __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+				  want 0x1.9225645bdd7c3p-1.  */
+sv_f64_t
+__sv_atan_x (sv_f64_t x, const svbool_t pg)
+{
+  /* No need to trigger special case. Small cases, infs and nans
+     are supported by our approximation technique.  */
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask);
+
+  /* Argument reduction:
+     y := arctan(x) for x < 1
+     y := pi/2 + arctan(-1/x) for x > 1
+     Hence, use z=-1/a if x>=1, otherwise z=a.  */
+  svbool_t red = svacgt_n_f64 (pg, x, 1.0);
+  /* Avoid a dependency on abs(x) in the division (and comparison).  */
+  sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x);
+  /* Use absolute value only when needed (odd powers of z).  */
+  sv_f64_t az = svabs_f64_x (pg, z);
+  az = svneg_f64_m (az, red, az);
+
+  sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2);
+
+  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
+  y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
+
+  return y;
+}
+
+PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan)
+
+PL_SIG (SV, D, 1, atan, -3.1, 3.1)
+PL_TEST_ULP (__sv_atan, 1.78)
+PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000)
+#endif
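
The reduction above is the identity atan(x) = pi/2 + atan(-1/x) for x > 1, extended to negative x through the final sign fixup. A scalar sketch of the same idea (illustrative; libm atan stands in for the shared polynomial in sv_atan_common.h):

#include <math.h>

/* Scalar model: for |x| > 1 evaluate on -1/|x| and add pi/2, then
   restore the sign of x (the sveor on `sign` above).  */
static double
atan_sketch (double x)
{
  double ax = fabs (x);
  if (ax > 1.0)
    return copysign (0x1.921fb54442d18p+0 + atan (-1.0 / ax), x);
  return atan (x);
}
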
diff --git a/pl/math/sv_atan_common.h b/pl/math/sv_atan_common.h
new file mode 100644
index 0000000..bfe6998
--- /dev/null
+++ b/pl/math/sv_atan_common.h
@@ -0,0 +1,61 @@
+/*
+ * Double-precision polynomial evaluation function for SVE atan(x) and
+ * atan2(y,x).
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "sv_math.h"
+
+#define P(i) sv_f64 (__atan_poly_data.poly[i])
+
+/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations
+   The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+static inline sv_f64_t
+__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az,
+		  sv_f64_t shift)
+{
+  /* Use full Estrin scheme for P(z^2) with deg(P)=19.  */
+  sv_f64_t z2 = svmul_f64_x (pg, z, z);
+
+  /* Level 1.  */
+  sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0));
+  sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2));
+  sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4));
+  sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6));
+  sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8));
+  sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10));
+  sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12));
+  sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14));
+  sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16));
+  sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18));
+
+  /* Level 2.  */
+  sv_f64_t x2 = svmul_f64_x (pg, z2, z2);
+  sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0);
+  sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4);
+  sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8);
+  sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12);
+  sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16);
+
+  /* Level 3.  */
+  sv_f64_t x4 = svmul_f64_x (pg, x2, x2);
+  sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0);
+  sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8);
+
+  /* Level 4.  */
+  sv_f64_t x8 = svmul_f64_x (pg, x4, x4);
+  sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8);
+  y = sv_fma_f64_x (pg, y, x8, P_7_0);
+
+  /* Finalize. y = shift + z + z^3 * P(z^2).  */
+  sv_f64_t z3 = svmul_f64_x (pg, z2, az);
+  y = sv_fma_f64_x (pg, y, z3, az);
+
+  /* Apply shift as indicated by `red` predicate.  */
+  y = svadd_f64_m (red, y, shift);
+
+  return y;
+}
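
The level structure above is plain Estrin. A scalar C model of the same pairing, assuming a generic coefficient array c[20] (a stand-in for __atan_poly_data.poly; the polynomial variable there is w = z^2):

/* Degree-19 Estrin evaluation of P(w) = c[0] + c[1]*w + ... + c[19]*w^19,
   mirroring the level-1..4 grouping above.  The ten level-1 FMAs are
   mutually independent, which is what the vector code exploits.  */
static double
estrin_19 (const double c[20], double w)
{
  /* Level 1: ten independent pairs c[2i] + w * c[2i+1].  */
  double p_1_0 = c[0] + w * c[1];
  double p_3_2 = c[2] + w * c[3];
  double p_5_4 = c[4] + w * c[5];
  double p_7_6 = c[6] + w * c[7];
  double p_9_8 = c[8] + w * c[9];
  double p_11_10 = c[10] + w * c[11];
  double p_13_12 = c[12] + w * c[13];
  double p_15_14 = c[14] + w * c[15];
  double p_17_16 = c[16] + w * c[17];
  double p_19_18 = c[18] + w * c[19];

  /* Level 2: combine pairs with w^2.  */
  double w2 = w * w;
  double p_3_0 = p_1_0 + w2 * p_3_2;
  double p_7_4 = p_5_4 + w2 * p_7_6;
  double p_11_8 = p_9_8 + w2 * p_11_10;
  double p_15_12 = p_13_12 + w2 * p_15_14;
  double p_19_16 = p_17_16 + w2 * p_19_18;

  /* Levels 3 and 4: combine with w^4 and w^8.  */
  double w4 = w2 * w2;
  double p_7_0 = p_3_0 + w4 * p_7_4;
  double p_15_8 = p_11_8 + w4 * p_15_12;

  double w8 = w4 * w4;
  return p_7_0 + w8 * (p_15_8 + w8 * p_19_16);
}

Compared with a Horner chain of 19 dependent FMAs, the critical path shrinks to about five dependent FMAs, at the cost of the extra squarings w2, w4 and w8.
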
diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c
new file mode 100644
index 0000000..8d38e42
--- /dev/null
+++ b/pl/math/sv_atanf_2u9.c
@@ -0,0 +1,59 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#include "sv_atanf_common.h"
+
+#define PiOver2 sv_f32 (0x1.921fb6p+0f)
+#define AbsMask (0x7fffffff)
+
+/* Fast implementation of SVE atanf based on
+   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=-1/x and shift = pi/2.
+   Largest observed error is 2.9 ULP, close to +/-1.0:
+   __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1
+			    want -0x1.967fp-1.  */
+sv_f32_t
+__sv_atanf_x (sv_f32_t x, const svbool_t pg)
+{
+  /* No need to trigger special case. Small cases, infs and nans
+     are supported by our approximation technique.  */
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
+
+  /* Argument reduction:
+     y := arctan(x)            for |x| < 1
+     y := pi/2 + arctan(-1/x)  for x > 1
+     (the sign of the result is fixed up at the end for negative x).
+     Hence, use z = -1/x if |x| > 1, otherwise z = x.  */
+  svbool_t red = svacgt_n_f32 (pg, x, 1.0f);
+  /* Avoid dependency on abs(x) in division (and comparison).  */
+  sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x);
+  /* Use absolute value only when needed (odd powers of z).  */
+  sv_f32_t az = svabs_f32_x (pg, z);
+  az = svneg_f32_m (az, red, az);
+
+  sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2);
+
+  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
+  return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
+}
+
+PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf)
+
+PL_SIG (SV, F, 1, atan, -3.1, 3.1)
+PL_TEST_ULP (__sv_atanf, 2.9)
+PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000)
+#endif
diff --git a/pl/math/sv_atanf_common.h b/pl/math/sv_atanf_common.h
new file mode 100644
index 0000000..dc45eff
--- /dev/null
+++ b/pl/math/sv_atanf_common.h
@@ -0,0 +1,47 @@
+/*
+ * Single-precision polynomial evaluation function for SVE atan(x) and
+ * atan2(y,x).
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_SV_ATANF_COMMON_H
+#define PL_MATH_SV_ATANF_COMMON_H
+
+#include "math_config.h"
+#include "sv_math.h"
+
+#define P(i) sv_f32 (__atanf_poly_data.poly[i])
+
+/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations
+   The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+static inline sv_f32_t
+__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az,
+		   sv_f32_t shift)
+{
+  /* Use full Estrin scheme for P(z^2) with deg(P)=7.  */
+
+  /* First compute square powers of z.  */
+  sv_f32_t z2 = svmul_f32_x (pg, z, z);
+  sv_f32_t z4 = svmul_f32_x (pg, z2, z2);
+  sv_f32_t z8 = svmul_f32_x (pg, z4, z4);
+
+  /* Then assemble polynomial.  */
+  sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))),
+				 (sv_fma_f32_x (pg, z2, P (5), P (4))));
+  sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))),
+				 (sv_fma_f32_x (pg, z2, P (1), P (0))));
+  sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3);
+
+  /* Finalize. y = shift + z + z^3 * P(z^2).  */
+  sv_f32_t z3 = svmul_f32_x (pg, z2, az);
+  y = sv_fma_f32_x (pg, y, z3, az);
+
+  /* Apply shift as indicated by 'red' predicate.  */
+  y = svadd_f32_m (red, y, shift);
+
+  return y;
+}
+
+#endif // PL_MATH_SV_ATANF_COMMON_H
diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c
new file mode 100644
index 0000000..1940348
--- /dev/null
+++ b/pl/math/sv_cos_2u5.c
@@ -0,0 +1,84 @@
+/*
+ * Double-precision SVE cos(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1))
+#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0))
+#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26))
+#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54))
+/* Original shift used in Neon cos,
+   plus a contribution to set the bit #0 of q
+   as expected by trigonometric instructions.  */
+#define Shift (sv_f64 (0x1.8000000000001p52))
+#define RangeVal (sv_f64 (0x1p23))
+#define AbsMask (0x7fffffffffffffff)
+
+static NOINLINE sv_f64_t
+__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (cos, x, y, cmp);
+}
+
+/* A fast SVE implementation of cos based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.108 ULPs.
+   __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
+				 want -0x1.fddd4c65c7f05p-3.  */
+sv_f64_t
+__sv_cos_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_f64_t n, r, r2, y;
+  svbool_t cmp;
+
+  r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask));
+  cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal));
+
+  /* n = rint(|x|/(pi/2)).  */
+  sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift);
+  n = svsub_f64_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  r = sv_fma_f64_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_3, n, r);
+
+  /* cos(r) poly approx.  */
+  r2 = svtsmul_f64 (r, sv_as_u64_f64 (q));
+  y = sv_f64 (0.0);
+  y = svtmad_f64 (y, r2, 7);
+  y = svtmad_f64 (y, r2, 6);
+  y = svtmad_f64 (y, r2, 5);
+  y = svtmad_f64 (y, r2, 4);
+  y = svtmad_f64 (y, r2, 3);
+  y = svtmad_f64 (y, r2, 2);
+  y = svtmad_f64 (y, r2, 1);
+  y = svtmad_f64 (y, r2, 0);
+
+  /* Final multiplicative factor: 1.0 or r depending on bit #0 of q.  */
+  sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q));
+  /* Apply factor.  */
+  y = svmul_f64_x (pg, f, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_cos_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos)
+
+PL_SIG (SV, D, 1, cos, -3.1, 3.1)
+PL_TEST_ULP (__sv_cos, 1.61)
+PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000)
+#endif
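
The Shift add/subtract above is the usual round-to-nearest-integer trick, and the three NegPio2 constants subtract n*(pi/2) in pieces so the reduced argument keeps extra effective precision. A scalar model (illustrative; it uses the plain 0x1.8p52 shift without the bit-0 tweak that FTMAD/FTSSEL consume):

#include <math.h>

static double
cos_reduce_sketch (double x, double *r_out)
{
  const double inv_pio2 = 0x1.45f306dc9c882p-1;
  const double shift = 0x1.8p52;
  const double neg_pio2_1 = -0x1.921fb50000000p+0;
  const double neg_pio2_2 = -0x1.110b460000000p-26;
  const double neg_pio2_3 = -0x1.1a62633145c07p-54;

  double r = fabs (x);
  /* Adding and subtracting 1.5*2^52 rounds r*inv_pio2 to the nearest
     integer (for |x| < RangeVal) without an explicit rint.  */
  double q = fma (inv_pio2, r, shift);
  double n = q - shift;

  /* r = |x| - n*(pi/2), with pi/2 split into three parts so the FMAs
     cancel exactly and r keeps extra effective precision.  */
  r = fma (neg_pio2_1, n, r);
  r = fma (neg_pio2_2, n, r);
  r = fma (neg_pio2_3, n, r); /* r in [-pi/4, pi/4].  */

  *r_out = r;
  return n;
}
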
diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c
new file mode 100644
index 0000000..8f138bc
--- /dev/null
+++ b/pl/math/sv_cosf_2u1.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision SVE cos(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define RangeVal (sv_f32 (0x1p20f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+/* Original shift used in Neon cosf,
+   plus a contribution to set the bit #0 of q
+   as expected by trigonometric instructions.  */
+#define Shift (sv_f32 (0x1.800002p+23f))
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (cosf, x, y, cmp);
+}
+
+/* A fast SVE implementation of cosf based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum measured error: 2.06 ULPs.
+   __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+			    want 0x1.fffe76p-6.  */
+sv_f32_t
+__sv_cosf_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_f32_t n, r, r2, y;
+  svbool_t cmp;
+
+  r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
+  cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+
+  /* n = rint(|x|/(pi/2)).  */
+  sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift);
+  n = svsub_f32_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  r = sv_fma_f32_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+  /* Final multiplicative factor: 1.0 or r depending on bit #0 of q.  */
+  sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q));
+
+  /* cos(r) poly approx.  */
+  r2 = svtsmul_f32 (r, sv_as_u32_f32 (q));
+  y = sv_f32 (0.0f);
+  y = svtmad_f32 (y, r2, 4);
+  y = svtmad_f32 (y, r2, 3);
+  y = svtmad_f32 (y, r2, 2);
+  y = svtmad_f32 (y, r2, 1);
+  y = svtmad_f32 (y, r2, 0);
+
+  /* Apply factor.  */
+  y = svmul_f32_x (pg, f, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset,
+     guaranteed by the cmpge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_cosf_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf)
+
+PL_SIG (SV, F, 1, cos, -3.1, 3.1)
+PL_TEST_ULP (__sv_cosf, 1.57)
+PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000)
+#endif
diff --git a/pl/math/sv_erf_3u.c b/pl/math/sv_erf_3u.c
new file mode 100644
index 0000000..bec7f8a
--- /dev/null
+++ b/pl/math/sv_erf_3u.c
@@ -0,0 +1,103 @@
+/*
+ * Double-precision SVE erf(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define Scale (8.0)
+#define AbsMask (0x7fffffffffffffff)
+
+static NOINLINE sv_f64_t
+__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (erf, x, y, cmp);
+}
+
+/* Optimized double precision SVE error function erf.
+   Maximum observed error is 2.62 ULP:
+   __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0
+				 want 0x1.fffffffffffffp-1.  */
+sv_f64_t
+__sv_erf_x (sv_f64_t x, const svbool_t pg)
+{
+  /* Use top 16 bits to test for special cases and small values.  */
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff);
+
+  /* Handle both inf/nan as well as small values (|x|<2^-28).  */
+  svbool_t cmp
+    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30);
+
+  /* Get sign and absolute value.  */
+  sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask));
+  sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask);
+
+  /* i = trunc(Scale*x).  */
+  sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale);
+  /* Saturate index of intervals.  */
+  svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018);
+  sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale);
+
+  /* Load polynomial coefficients.  */
+  sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i);
+  sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i);
+  sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i);
+  sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i);
+  sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i);
+  sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i);
+  sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i);
+  sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i);
+  sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i);
+  sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i);
+
+  /* Get shift and scale.  */
+  sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i);
+
+  /* Transform polynomial variable.
+     Set z = 0 in the boring domain to avoid overflow.  */
+  sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a);
+
+  /* Evaluate polynomial P(z) using level-2 Estrin.  */
+  sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0);
+  sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2);
+  sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4);
+  sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6);
+  sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8);
+
+  sv_f64_t z2 = svmul_f64_x (pg, z, z);
+  sv_f64_t z4 = svmul_f64_x (pg, z2, z2);
+
+  sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3);
+  sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1);
+
+  sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2);
+  y = sv_fma_f64_x (pg, z4, y, q1);
+
+  /* y = erf(x) if x > 0, -erf(-x) otherwise.  */
+  y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_erf_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf)
+
+PL_SIG (SV, D, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (__sv_erf, 2.13)
+PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000)
+PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000)
+PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000)
+PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000)
+PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000)
+PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000)
+PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000)
+PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000)
+#endif
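
Interval selection above amounts to i = trunc(8 * |x|), saturated once |x| >= 6. A scalar model (NINTS is an illustrative stand-in for V_ERF_NINTS):

#include <math.h>

enum { NINTS = 49 }; /* Illustrative value only.  */

/* Scalar model of the index: each polynomial covers a 1/8-wide slice of
   |x| (Scale = 8), saturating at the last interval once |x| >= 6.  */
static unsigned
erf_interval_idx (double x)
{
  double a = fabs (x);
  if (!(a < 6.0)) /* Also catches nan.  */
    return NINTS - 1;
  return (unsigned) (8.0 * a); /* trunc (Scale * |x|).  */
}
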
diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c
new file mode 100644
index 0000000..076b471
--- /dev/null
+++ b/pl/math/sv_erfc_4u.c
@@ -0,0 +1,146 @@
+/*
+ * Double-precision SVE erfc(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+#include "sv_exp_tail.h"
+
+sv_f64_t __sv_exp_x (sv_f64_t, svbool_t);
+
+static NOINLINE sv_f64_t
+specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
+{
+  return sv_call_f64 (erfc, x, y, special);
+}
+
+static inline sv_u64_t
+lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x)
+{
+  /* Interval index is calculated by (((abs(x) + 1)^4) >> 52) - 1023, i.e. the
+     unbiased exponent of (1 + |x|)^4, bounded by the number of polynomials.  */
+  sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1);
+  xp1 = svmul_f64_x (pg, xp1, xp1);
+  xp1 = svmul_f64_x (pg, xp1, xp1);
+  sv_u64_t interval_idx
+    = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023);
+  return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS),
+		    interval_idx, sv_u64 (ERFC_NUM_INTERVALS));
+}
+
+static inline sv_f64_t
+sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx)
+{
+  sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1);
+  const double *base = &__v_erfc_data.poly[0][12];
+  sv_f64_t r = sv_lookup_f64_x (pg, base, offset);
+  for (int i = 0; i < ERFC_POLY_ORDER; i++)
+    {
+      base--;
+      sv_f64_t c = sv_lookup_f64_x (pg, base, offset);
+      r = sv_fma_f64_x (pg, z, r, c);
+    }
+  return r;
+}
+
+static inline sv_f64_t
+sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x)
+{
+  /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding
+     errors in x^2, so we compute an estimate for the error and use a custom exp
+     helper which corrects for the calculated error estimate.  */
+  sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x);
+
+  /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and
+     a_lo is the 'small' component.  */
+  const sv_f64_t scale = sv_f64 (0x1.0000002p27);
+  sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x,
+						 svneg_f64_x (pg, abs_x)));
+  a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi);
+  sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi);
+
+  sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi);
+  sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo);
+
+  /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) -
+     (a_hi + a_lo) * (a_hi + a_lo).  */
+  sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2);
+  e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2);
+  e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2);
+  e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2);
+
+  return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2);
+}
+
+/* Optimized double precision vector complementary error function erfc.
+   Maximum measured error is 3.64 ULP:
+   __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
+				  want 0x1.ff3f4c8e200d9p-42.  */
+sv_f64_t
+__sv_erfc_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_f64_t abs_x = svabs_f64_x (pg, x);
+  sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52);
+
+  /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes
+     to 2. As long as the polynomial is 0 in the boring zone, we can assemble
+     the result correctly. This is dealt with in two ways:
+
+     The 'coarse approach' is that the approximation algorithm is
+     zero-predicated on in_bounds = |x| < 32, which saves the need to do
+     coefficient lookup etc for |x| >= 32.
+
+     The coarse approach misses [-32, -6] and [28, 32], which are dealt with in
+     the polynomial and index calculation, such that the polynomial evaluates to
+     0 in these regions.  */
+  /* in_bounds is true for lanes where |x| < 32.  */
+  svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404);
+  /* boring_zone = 2 for x < 0, 0 otherwise.  */
+  sv_f64_t boring_zone
+    = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62));
+  /* Very small, nan and inf.  */
+  svbool_t special_cases
+    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432);
+
+  /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2)
+
+     Where P_i is a polynomial and x_i is an offset, both defined in
+     v_erfc_data.c. i is chosen based on which interval x falls in.  */
+  sv_u64_t i = lookup_interval_idx (in_bounds, abs_x);
+  sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i);
+  sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i);
+  /* 'copy' sign of x to p, i.e. negate p if x is negative.  */
+  sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff);
+  p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign));
+
+  sv_f64_t e = sv_eval_gauss (in_bounds, abs_x);
+
+  /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally
+     select boring_zone because P[V_ERFC_NINTS-1]=0.  */
+  sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone);
+
+  if (unlikely (svptest_any (pg, special_cases)))
+    {
+      return specialcase (x, y, special_cases);
+    }
+  return y;
+}
+
+PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc)
+
+PL_SIG (SV, D, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (__sv_erfc, 3.15)
+PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000)
+PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000)
+#endif
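
The interval index above is a bit trick: the unbiased exponent of (1 + |x|)^4 equals floor(4 * log2(1 + |x|)), which grades the intervals geometrically in 1 + |x|. A scalar model (NUM is an illustrative stand-in for ERFC_NUM_INTERVALS):

#include <stdint.h>
#include <string.h>

enum { NUM = 20 }; /* Illustrative value only.  */

/* Scalar model: (bits >> 52) - 1023 is the unbiased exponent, so the
   index is floor(4 * log2(1 + |x|)), clamped to the table size.  */
static uint64_t
erfc_interval_idx (double abs_x)
{
  double xp1 = (1.0 + abs_x) * (1.0 + abs_x);
  xp1 *= xp1; /* (1 + |x|)^4.  */

  uint64_t bits;
  memcpy (&bits, &xp1, sizeof bits);
  uint64_t idx = (bits >> 52) - 1023;
  return idx <= NUM ? idx : NUM;
}
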
diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c
new file mode 100644
index 0000000..c7a738c
--- /dev/null
+++ b/pl/math/sv_erff_1u3.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (erff, x, y, cmp);
+}
+
+sv_f32_t __sv_expf_x (sv_f32_t, svbool_t);
+
+/* Optimized single precision vector erf. Worst-case error is 1.25 ULP:
+   __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1
+			   want 0x1.9f9c8ap-1.  */
+sv_f32_t
+__sv_erff_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff);
+  /* Handle both inf/nan as well as small values (|x|<2^-28).  */
+  svbool_t cmp
+    = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180);
+
+  sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
+  /* |x| < 0.921875.  */
+  svbool_t red = svaclt_n_f32 (pg, x, 0.921875f);
+  /* |x| > 4.0.  */
+  svbool_t bor = svacgt_n_f32 (pg, x, 4.0f);
+
+  /* Load polynomial coefficients.  */
+  sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1));
+  sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2);
+
+  const float *base = (float *) __v_erff_data.coeffs;
+  sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2);
+  sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6);
+  sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10);
+
+  /* Do not need to store elem 0 of __v_erff_data as it is not used.  */
+  sv_f32_t p1 = svtbl (c_2_5, idx_lo);
+  sv_f32_t p2 = svtbl (c_2_5, idx_hi);
+  sv_f32_t p3 = svtbl (c_6_9, idx_lo);
+  sv_f32_t p4 = svtbl (c_6_9, idx_hi);
+  sv_f32_t p5 = svtbl (c_10_13, idx_lo);
+  sv_f32_t p6 = svtbl (c_10_13, idx_hi);
+
+  sv_f32_t a = svabs_f32_x (pg, x);
+  /* Square with merging mul - z is x^2 for reduced, |x| otherwise.  */
+  sv_f32_t z = svmul_f32_m (red, a, a);
+
+  /* Evaluate polynomial on |x| or x^2.  */
+  sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5);
+  r = sv_fma_f32_x (pg, z, r, p4);
+  r = sv_fma_f32_x (pg, z, r, p3);
+  r = sv_fma_f32_x (pg, z, r, p2);
+  r = sv_fma_f32_x (pg, z, r, p1);
+  /* Use merging svmad for last operation - apply first coefficient if not
+     reduced, otherwise r is propagated unchanged. This is because the reduced
+     polynomial has lower order than the non-reduced.  */
+  r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]);
+  r = sv_fma_f32_x (pg, a, r, a);
+
+  /* y = |x| + |x| * P(x^2)               if |x| < 0.921875
+     y = 1 - exp (-(|x| + |x| * P(|x|)))  otherwise.  */
+  sv_f32_t y = __sv_expf_x (svneg_f32_x (pg, r), pg);
+  y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0));
+
+  /* Boring domain (absolute value is required to get the sign of erf(-nan)
+     right).  */
+  y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y));
+
+  /* y = erf(x) if x>0, -erf(-x) otherwise.  */
+  y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_erff_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff)
+
+PL_SIG (SV, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (__sv_erff, 0.76)
+PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000)
+PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000)
+PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000)
+PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000)
+PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000)
+PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000)
+PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000)
+PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000)
+#endif
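
The svtbl lookups above select between two coefficient sets per lane: one evaluated on x^2 below the 0.921875 threshold, one evaluated on |x| and fed through expf above it. A simplified scalar model (illustrative: a single common degree `deg` is assumed for both regimes, and the |x| > 4 'boring' select is modelled by the final clamp):

#include <math.h>

static float
erff_sketch (float x, const float *c_red, const float *c_tail, int deg)
{
  float a = fabsf (x);
  int red = a < 0.921875f;
  float z = red ? a * a : a;
  const float *c = red ? c_red : c_tail;

  float r = c[deg];
  for (int i = deg - 1; i >= 0; i--) /* Horner; the SVE code fuses this.  */
    r = r * z + c[i];
  r = a + a * r; /* |x| + |x| * P(z).  */

  float y = red ? r : 1.0f - expf (-r);
  return copysignf (fminf (y, 1.0f), x);
}
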
diff --git a/pl/math/sv_exp_tail.h b/pl/math/sv_exp_tail.h
new file mode 100644
index 0000000..9b739da
--- /dev/null
+++ b/pl/math/sv_exp_tail.h
@@ -0,0 +1,79 @@
+/*
+ * Double-precision SVE e^(x+tail) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef SV_EXP_TAIL_H
+#define SV_EXP_TAIL_H
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+#include "v_exp_tail.h"
+
+#define C1 sv_f64 (C1_scal)
+#define C2 sv_f64 (C2_scal)
+#define C3 sv_f64 (C3_scal)
+#define MinusLn2hi (-Ln2hi_scal)
+#define MinusLn2lo (-Ln2lo_scal)
+
+#define N (1 << V_EXP_TAIL_TABLE_BITS)
+#define Tab __v_exp_tail_data
+#define IndexMask (N - 1)
+#define Shift sv_f64 (0x1.8p+52)
+#define Thres 704.0
+
+static inline sv_f64_t
+sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n)
+{
+  sv_f64_t absn = svabs_f64_x (pg, n);
+
+  /* 2^(n/N) may overflow, break it up into s1*s2.  */
+  sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000),
+			  sv_u64 (0));
+  sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000));
+  sv_f64_t s2 = sv_as_f64_u64 (
+    svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000),
+		 b));
+
+  svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N);
+  sv_f64_t r1 = svmul_f64_x (pg, s1, s1);
+  sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1);
+  return svsel_f64 (cmp, r1, r0);
+}
+
+static inline sv_f64_t
+sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail)
+{
+  /* Calculate exp(x + xtail).  */
+  sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift);
+  sv_f64_t n = svsub_f64_x (pg, z, Shift);
+
+  sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x);
+  r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r);
+
+  sv_u64_t u = sv_as_u64_f64 (z);
+  sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+  sv_u64_t i = svand_n_u64_x (pg, u, IndexMask);
+
+  sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2);
+  y = sv_fma_f64_x (pg, y, r, C1);
+  y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0));
+  y = sv_fma_f64_x (pg, y, r, xtail);
+
+  /* s = 2^(n/N).  */
+  u = sv_lookup_u64_x (pg, Tab, i);
+  sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e));
+
+  svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres);
+  if (unlikely (svptest_any (pg, cmp)))
+    {
+      return sv_exp_tail_special_case (pg, s, y, n);
+    }
+  return sv_fma_f64_x (pg, y, s, s);
+}
+
+#endif
+#endif
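
The core of sv_exp_tail is the standard table-based 2^(n/N) construction: split n/N into an integer part k and a fractional part j/N, look up 2^(j/N), and add k straight into the exponent field. A scalar model (illustrative sizes; exp2 stands in for the __v_exp_tail_data table, and the Shift-based rounding is unfolded into an explicit round):

#include <math.h>
#include <stdint.h>
#include <string.h>

enum { TAB_BITS = 8, N = 1 << TAB_BITS }; /* Illustrative table size.  */

/* Valid while the result stays normal; the vector code's special case
   handles the rest.  */
static double
exp_scale_sketch (double x)
{
  int64_t n = (int64_t) round (x * N / 0x1.62e42fefa39efp-1);
  int64_t j = n & (N - 1); /* n mod N, the table index.  */
  int64_t k = (n - j) / N; /* floor (n / N).  */

  double s = exp2 ((double) j / N);
  uint64_t bits;
  memcpy (&bits, &s, sizeof bits);
  bits += (uint64_t) k << 52; /* Multiply by 2^k via the exponent.  */
  memcpy (&s, &bits, sizeof bits);
  return s; /* ~= e^x up to the polynomial correction.  */
}
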
diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
new file mode 100644
index 0000000..87fbe45
--- /dev/null
+++ b/pl/math/sv_expf_2u.c
@@ -0,0 +1,156 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define C(i) __sv_expf_poly[i]
+
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+
+#if SV_EXPF_USE_FEXPA
+
+#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127.  */
+#define Thres                                                                  \
+  (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal    \
+		      and not handled correctly by FEXPA.  */
+
+static NOINLINE sv_f32_t
+special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+{
+  /* The special-case handler from the Neon routine does not handle subnormals
+     in a way that is compatible with FEXPA. For the FEXPA variant we just fall
+     back to scalar expf.  */
+  return sv_call_f32 (expf, x, y, special);
+}
+
+#else
+
+#define Shift (0x1.8p23f) /* 1.5 * 2^23.  */
+#define Thres (126.0f)
+
+/* Special-case handler adapted from the Neon variant. It uses s, y and n to
+   produce the final result (normal cases included) and updates all lanes.
+   Therefore:
+   - all previous computation needs to be done on all lanes indicated by the
+     input pg
+   - we cannot simply apply the special case to the special-case-activated
+     lanes. Besides, it is likely that this would not increase performance (no
+     scatter/gather).  */
+static inline sv_f32_t
+specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e,
+	     svbool_t p_cmp1, sv_f32_t scale)
+{
+  /* s=2^(n/N) may overflow, break it up into s=s1*s2,
+     such that exp = s + s*y can be computed as s1*(s2+s2*y)
+     and s1*s1 overflows only if n>0.  */
+
+  /* If n<=0 then set b to 0x820...0, 0 otherwise.  */
+  svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0.  */
+  sv_u32_t b
+    = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0.  */
+
+  /* Set s1 to generate overflow depending on sign of exponent n.  */
+  sv_f32_t s1
+    = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000.  */
+  /* Offset s to avoid overflow in final result if n is below threshold.  */
+  sv_f32_t s2 = sv_as_f32_u32 (
+    svsub_u32_x (pg, e, b)); /* as_u32 (scale) - 0x3f800000 - b.  */
+
+  /* |n| > 192 => 2^(n/N) overflows.  */
+  svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f);
+
+  sv_f32_t r2 = svmul_f32_x (pg, s1, s1);
+  sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2);
+  r1 = svmul_f32_x (pg, r1, s1);
+  sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale);
+
+  /* Apply condition 1 then 2.
+     Returns r2 if cond2 is true, otherwise
+     if cond1 is true then return r1, otherwise return r0.  */
+  sv_f32_t r = svsel_f32 (p_cmp1, r1, r0);
+
+  return svsel_f32 (p_cmp2, r2, r);
+}
+
+#endif
+
+/* Optimised single-precision SVE exp function. By default this is an SVE port
+   of the Neon algorithm from math/. Alternatively, enable a modification of
+   that algorithm that looks up scale using SVE FEXPA instruction with
+   SV_EXPF_USE_FEXPA.
+
+   Worst-case error of the default algorithm is 1.95 ulp:
+   __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8
+			     want 0x1.6a023p-8.
+
+   Worst-case error when using FEXPA is 1.04 ulp:
+   __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4
+			   want 0x1.ba74bap+4.  */
+sv_f32_t
+__sv_expf_x (sv_f32_t x, const svbool_t pg)
+{
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+
+  /* n = round(x/(ln2/N)).  */
+  sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift));
+  sv_f32_t n = svsub_n_f32_x (pg, z, Shift);
+
+  /* r = x - n*ln2/N.  */
+  sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x);
+  r = sv_fma_n_f32_x (pg, -Ln2lo, n, r);
+
+/* scale = 2^(n/N).  */
+#if SV_EXPF_USE_FEXPA
+  /* NaNs also need special handling with FEXPA.  */
+  svbool_t is_special_case
+    = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x));
+  sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z));
+#else
+  sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23);
+  svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres);
+  sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000));
+#endif
+
+  /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
+  sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1)));
+  sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3)));
+  q = sv_fma_f32_x (pg, p, r2, q);
+  p = svmul_n_f32_x (pg, r, C (4));
+  sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p);
+
+  if (unlikely (svptest_any (pg, is_special_case)))
+#if SV_EXPF_USE_FEXPA
+    return special_case (x, sv_fma_f32_x (pg, poly, scale, scale),
+			 is_special_case);
+#else
+    return specialcase (pg, poly, n, e, is_special_case, scale);
+#endif
+
+  return sv_fma_f32_x (pg, poly, scale, scale);
+}
+
+PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf)
+
+PL_SIG (SV, F, 1, exp, -9.9, 9.9)
+PL_TEST_ULP (__sv_expf, 1.46)
+PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000)
+PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000)
+PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000)
+PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000)
+PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000)
+PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000)
+PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000)
+PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000)
+#endif // SV_SUPPORTED
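
The specialcase path above avoids overflow in scale = 2^n by splitting it into s1 * s2, so that s1*(s2 + s2*poly) is finite where scale + scale*poly would overflow or flush. A scalar model of just the split (illustrative; n is already the integer exponent, and the result is only meaningful in the ranges the |n| > 192 check above guards):

#include <stdint.h>
#include <string.h>

static float
expf_scale_split_sketch (int n) /* Wanted: 2^n with |n| possibly > 127.  */
{
  uint32_t e = (uint32_t) n << 23;
  uint32_t b = n <= 0 ? 0x82000000u : 0u; /* Bias for the negative side.  */

  uint32_t u1 = b + 0x7f000000u; /* s1 = 2^127, or 2^-125 when n <= 0.  */
  uint32_t u2 = e - b;           /* s2 carries the remaining exponent.  */

  float s1, s2;
  memcpy (&s1, &u1, sizeof s1);
  memcpy (&s2, &u2, sizeof s2);
  return s1 * s2; /* == 2^n.  */
}
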
diff --git a/pl/math/sv_expf_data.c b/pl/math/sv_expf_data.c
new file mode 100644
index 0000000..6875adf
--- /dev/null
+++ b/pl/math/sv_expf_data.c
@@ -0,0 +1,12 @@
+/*
+ * Coefficients for single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Coefficients copied from the polynomial in math/v_expf.c.  */
+const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f,
+				0x1.fffdb6p-2f, 0x1.ffffecp-1f};
diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c
new file mode 100644
index 0000000..884e201
--- /dev/null
+++ b/pl/math/sv_log10_2u5.c
@@ -0,0 +1,89 @@
+/*
+ * Double-precision SVE log10(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define OFF 0x3fe6900900000000
+#define N (1 << V_LOG10_TABLE_BITS)
+
+#define A(i) __v_log10_data.poly[i]
+
+static inline sv_f64_t
+specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
+{
+  return sv_call_f64 (log10, x, y, special);
+}
+
+/* SVE log10 algorithm. Maximum measured error is 2.46 ulps.
+   __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
+				   want 0x1.fffbdf6eaa667p-6.  */
+sv_f64_t
+__sv_log10_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
+
+  svbool_t is_special_case
+    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010);
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
+  sv_u64_t i
+    = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N);
+  sv_f64_t k
+    = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52));
+  sv_f64_t z = sv_as_f64_u64 (
+    svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)));
+
+  /* log(x) = k*log(2) + log(c) + log(z/c).  */
+
+  sv_u64_t idx = svmul_n_u64_x (pg, i, 2);
+  sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx);
+  sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx);
+
+  /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1):
+     r = z/c - 1 (we look up precomputed 1/c)
+     log(z/c) ~= P(r).  */
+  sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
+
+  /* hi = r/ln(10) + log10(c) + k*log10(2).  */
+  sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc);
+  sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w);
+
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  sv_f64_t r2 = svmul_f64_x (pg, r, r);
+  sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2)));
+  sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0)));
+  y = sv_fma_n_f64_x (pg, A (4), r2, y);
+  y = sv_fma_f64_x (pg, y, r2, p);
+  y = sv_fma_f64_x (pg, y, r2, hi);
+
+  if (unlikely (svptest_any (pg, is_special_case)))
+    {
+      return specialcase (x, y, is_special_case);
+    }
+  return y;
+}
+
+PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10)
+
+PL_SIG (SV, D, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (__sv_log10, 1.97)
+PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000)
+#endif
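
The reduction above is shared by all the double-precision log variants in this patch: subtract OFF from the bit pattern, read k from the exponent bits, and reconstruct z in [OFF, 2*OFF), i.e. roughly [0x1.69009p-1, 0x1.69009p+0) straddling 1, so that z/c - 1 is small. A scalar model (illustrative; the arithmetic right shift matches the svasr above):

#include <stdint.h>
#include <string.h>

#define OFF 0x3fe6900900000000ULL

/* Scalar model of the reduction: returns z with x = 2^k * z.  */
static double
log_reduce_sketch (double x, int64_t *k)
{
  uint64_t ix, tmp, iz;
  memcpy (&ix, &x, sizeof ix);
  tmp = ix - OFF;
  *k = (int64_t) tmp >> 52;           /* Arithmetic shift, as svasr above.  */
  iz = ix - (tmp & (0xfffULL << 52)); /* Remove 2^k from the exponent.  */
  double z;
  memcpy (&z, &iz, sizeof z);
  return z;
}
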
diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c
new file mode 100644
index 0000000..e7b1e98
--- /dev/null
+++ b/pl/math/sv_log10f_3u5.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision SVE log10 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define SpecialCaseMin 0x00800000
+#define SpecialCaseMax 0x7f800000
+#define Offset 0x3f2aaaab /* 0.666667.  */
+#define Mask 0x007fffff
+#define Ln2 0x1.62e43p-1f /* 0x3f317218.  */
+#define InvLn10 0x1.bcb7b2p-2f
+
+#define P(i) __v_log10f_poly[i]
+
+static NOINLINE sv_f32_t
+special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+{
+  return sv_call_f32 (log10f, x, y, special);
+}
+
+/* Optimised implementation of SVE log10f using the same algorithm and
+   polynomial as v_log10f. Maximum error is 3.31ulps:
+   __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+			     want 0x1.ffe2f4p-4.  */
+sv_f32_t
+__sv_log10f_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t ix = sv_as_u32_f32 (x);
+  svbool_t special_cases
+    = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin),
+		     SpecialCaseMax - SpecialCaseMin);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  ix = svsub_n_u32_x (pg, ix, Offset);
+  sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix),
+						   23)); /* Sign-extend.  */
+  ix = svand_n_u32_x (pg, ix, Mask);
+  ix = svadd_n_u32_x (pg, ix, Offset);
+  sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f);
+
+  /* y = log10(1+r) + n*log10(2)
+     log10(1+r) ~ r * InvLn10 + P(r)
+     where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
+     log10(1+x)/x, with x in [-1/3, 1/3] (offset = 2/3).
+
+     P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67)))
+     and Qij = Pi + r * Pj.  */
+  sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
+  sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
+  sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
+  sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
+
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
+  sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56);
+  y = sv_fma_f32_x (pg, y, r2, q34);
+  y = sv_fma_f32_x (pg, y, r2, q12);
+
+  /* Computing p = log10(2)*n + r*InvLn10 directly would be slightly faster
+     but less accurate.  */
+  sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r);
+  y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10));
+
+  if (unlikely (svptest_any (pg, special_cases)))
+    {
+      return special_case (x, y, special_cases);
+    }
+  return y;
+}
+
+PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f)
+
+PL_SIG (SV, F, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (__sv_log10f, 2.82)
+PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000)
+#endif
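
The single-precision variants use the same idea with Offset = as_u32(2/3): after the subtract, the exponent bits give n and the remainder maps 1+r into [2/3, 4/3), so |r| <= 1/3 for the polynomial. A scalar model (illustrative):

#include <stdint.h>
#include <string.h>

/* Scalar model of the reduction: returns r with x = 2^n * (1 + r).  */
static float
logf_reduce_sketch (float x, float *n)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  u -= 0x3f2aaaabu;
  *n = (float) ((int32_t) u >> 23); /* Sign-extending shift.  */
  u = (u & 0x007fffffu) + 0x3f2aaaabu;
  float r1; /* 1 + r in [2/3, 4/3).  */
  memcpy (&r1, &u, sizeof r1);
  return r1 - 1.0f;
}
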
diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c
new file mode 100644
index 0000000..a0815bb
--- /dev/null
+++ b/pl/math/sv_log2_3u.c
@@ -0,0 +1,85 @@
+/*
+ * Double-precision SVE log2 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define InvLn2 sv_f64 (0x1.71547652b82fep0)
+#define N (1 << V_LOG2_TABLE_BITS)
+#define OFF 0x3fe6900900000000
+#define P(i) sv_f64 (__v_log2_data.poly[i])
+
+NOINLINE static sv_f64_t
+specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp)
+{
+  return sv_call_f64 (log2, x, y, cmp);
+}
+
+/* Double-precision SVE log2 routine. Implements the same algorithm as vector
+   log10, with coefficients and table entries scaled in extended precision.
+   The maximum observed error is 2.58 ULP:
+   __sv_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+				  want 0x1.fffb34198d9ddp-5.  */
+sv_f64_t
+__sv_log2_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
+
+  svbool_t special
+    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010);
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
+  sv_u64_t i
+    = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N);
+  sv_f64_t k
+    = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52));
+  sv_f64_t z = sv_as_f64_u64 (
+    svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)));
+
+  sv_u64_t idx = svmul_n_u64_x (pg, i, 2);
+  sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx);
+  sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx);
+
+  /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k.  */
+
+  sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
+  sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c);
+
+  sv_f64_t r2 = svmul_f64_x (pg, r, r);
+  sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2));
+  sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0));
+  sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23);
+  y = sv_fma_f64_x (pg, y, r2, p_01);
+  y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w));
+
+  if (unlikely (svptest_any (pg, special)))
+    {
+      return specialcase (x, y, special);
+    }
+  return y;
+}
+
+PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2)
+
+PL_SIG (SV, D, 1, log2, 0.01, 11.1)
+PL_TEST_ULP (__sv_log2, 2.09)
+PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2)
+PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000)
+PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000)
+
+#endif
diff --git a/pl/math/sv_log2f_2u5.c b/pl/math/sv_log2f_2u5.c
new file mode 100644
index 0000000..fe2ab16
--- /dev/null
+++ b/pl/math/sv_log2f_2u5.c
@@ -0,0 +1,79 @@
+/*
+ * Single-precision vector/SVE log2 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define P(i) __v_log2f_data.poly[i]
+
+#define Ln2 (0x1.62e43p-1f) /* 0x3f317218.  */
+#define Min (0x00800000)
+#define Max (0x7f800000)
+#define Mask (0x007fffff)
+#define Off (0x3f2aaaab) /* 0.666667.  */
+
+static NOINLINE sv_f32_t
+specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (log2f, x, y, cmp);
+}
+
+/* Optimised implementation of SVE log2f, using the same algorithm
+   and polynomial as Neon log2f. Maximum error is 2.48 ULPs:
+   __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+			    want 0x1.a9be8p-2.  */
+sv_f32_t
+__sv_log2f_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t u = sv_as_u32_f32 (x);
+  svbool_t special
+    = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min));
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = svsub_n_u32_x (pg, u, Off);
+  sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u),
+						   23)); /* Sign-extend.  */
+  u = svand_n_u32_x (pg, u, Mask);
+  u = svadd_n_u32_x (pg, u, Off);
+  sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f);
+
+  /* y = log2(1+r) + n.  */
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
+
+  /* Evaluate polynomial using pairwise Horner scheme.  */
+  sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
+  sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
+  sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
+  sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
+  sv_f32_t y;
+  y = sv_fma_n_f32_x (pg, P (8), r2, p67);
+  y = sv_fma_f32_x (pg, y, r2, p45);
+  y = sv_fma_f32_x (pg, y, r2, p23);
+  y = sv_fma_f32_x (pg, y, r2, p01);
+  y = sv_fma_f32_x (pg, y, r, n);
+
+  if (unlikely (svptest_any (pg, special)))
+    return specialcase (x, y, special);
+  return y;
+}
+
+PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f)
+
+PL_SIG (SV, F, 1, log2, 0.01, 11.1)
+PL_TEST_ULP (__sv_log2f, 1.99)
+PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f)
+PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000)
+PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000)
+
+#endif // SV_SUPPORTED
diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c
new file mode 100644
index 0000000..7f06fd3
--- /dev/null
+++ b/pl/math/sv_log_2u5.c
@@ -0,0 +1,85 @@
+/*
+ * Double-precision SVE log(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define A(i) __sv_log_data.poly[i]
+#define Ln2 (0x1.62e42fefa39efp-1)
+#define N (1 << SV_LOG_TABLE_BITS)
+#define OFF (0x3fe6900900000000)
+
+double
+optr_aor_log_f64 (double);
+
+static NOINLINE sv_f64_t
+__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (optr_aor_log_f64, x, y, cmp);
+}
+
+/* SVE port of Neon log algorithm from math/.
+   Maximum measured error is 2.17 ulp:
+   __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+				 want 0x1.ffffff1cca045p-2.  */
+sv_f64_t
+__sv_log_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_u64_t ix = sv_as_u64_f64 (x);
+  sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
+  svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010),
+			      sv_u64 (0x7ff0 - 0x0010));
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
+  /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power
+     of 2.  */
+  sv_u64_t i
+    = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)),
+		     N - 1);
+  sv_s64_t k
+    = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift.  */
+  sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
+  sv_f64_t z = sv_as_f64_u64 (iz);
+  /* Lookup in 2 global lists (length N).  */
+  sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i);
+  sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
+  sv_f64_t kd = sv_to_f64_s64_x (pg, k);
+  /* hi = r + log(c) + k*Ln2.  */
+  sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r));
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  sv_f64_t r2 = svmul_f64_x (pg, r, r);
+  sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2)));
+  sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0)));
+  y = sv_fma_n_f64_x (pg, A (4), r2, y);
+  y = sv_fma_f64_x (pg, y, r2, p);
+  y = sv_fma_f64_x (pg, y, r2, hi);
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_log_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_log_x, _ZGVsMxv_log)
+
+PL_SIG (SV, D, 1, log, 0.01, 11.1)
+PL_TEST_ULP (__sv_log, 1.68)
+PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_log, 100, inf, 50000)
+#endif // SV_SUPPORTED
diff --git a/pl/math/sv_log_data.c b/pl/math/sv_log_data.c
new file mode 100644
index 0000000..77f9989
--- /dev/null
+++ b/pl/math/sv_log_data.c
@@ -0,0 +1,146 @@
+/*
+ * Coefficients for double-precision SVE log(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct sv_log_data __sv_log_data = {
+  /* All coefficients and table entries are copied from the Neon routine in
+     math/. See math/v_log_data.c for an explanation of the algorithm.  */
+
+  .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
+	   0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
+	   0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
+	   0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
+	   0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
+	   0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
+	   0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
+	   0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
+	   0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
+	   0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
+	   0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
+	   0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
+	   0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
+	   0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
+	   0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
+	   0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
+	   0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
+	   0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
+	   0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
+	   0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
+	   0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
+	   0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
+	   0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
+	   0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
+	   0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
+	   0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
+	   0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
+	   0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
+	   0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
+	   0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
+	   0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
+	   0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
+	   0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
+	   0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
+	   0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
+	   0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
+	   0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
+	   0x1.010037d38bcc2p+0, 1.0,
+	   0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
+	   0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
+	   0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
+	   0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
+	   0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
+	   0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
+	   0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
+	   0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
+	   0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
+	   0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
+	   0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
+	   0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
+	   0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
+	   0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
+	   0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
+	   0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
+	   0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
+	   0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
+	   0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
+	   0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
+	   0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
+	   0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
+	   0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
+	   0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
+	   0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
+	   0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1},
+
+  .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
+	   -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
+	   -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
+	   -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
+	   -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
+	   -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
+	   -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
+	   -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
+	   -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
+	   -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
+	   -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
+	   -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
+	   -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
+	   -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
+	   -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
+	   -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
+	   -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
+	   -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
+	   -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
+	   -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
+	   -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
+	   -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
+	   -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
+	   -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
+	   -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
+	   -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
+	   -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
+	   -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
+	   -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
+	   -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
+	   -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
+	   -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
+	   -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
+	   -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
+	   -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
+	   -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
+	   -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
+	   -0x1.ff6fe1feb4e53p-9, 0.0,
+	   0x1.fe91885ec8e20p-8,  0x1.fc516f716296dp-7,
+	   0x1.7bb4dd70a015bp-6,  0x1.f84c99b34b674p-6,
+	   0x1.39f9ce4fb2d71p-5,  0x1.7756c0fd22e78p-5,
+	   0x1.b43ee82db8f3ap-5,  0x1.f0b3fced60034p-5,
+	   0x1.165bd78d4878ep-4,  0x1.3425d2715ebe6p-4,
+	   0x1.51b8bd91b7915p-4,  0x1.6f15632c76a47p-4,
+	   0x1.8c3c88ecbe503p-4,  0x1.a92ef077625dap-4,
+	   0x1.c5ed5745fa006p-4,  0x1.e27876de1c993p-4,
+	   0x1.fed104fce4cdcp-4,  0x1.0d7bd9c17d78bp-3,
+	   0x1.1b76986cef97bp-3,  0x1.295913d24f750p-3,
+	   0x1.37239fa295d17p-3,  0x1.44d68dd78714bp-3,
+	   0x1.52722ebe5d780p-3,  0x1.5ff6d12671f98p-3,
+	   0x1.6d64c2389484bp-3,  0x1.7abc4da40fddap-3,
+	   0x1.87fdbda1e8452p-3,  0x1.95295b06a5f37p-3,
+	   0x1.a23f6d34abbc5p-3,  0x1.af403a28e04f2p-3,
+	   0x1.bc2c06a85721ap-3,  0x1.c903161240163p-3,
+	   0x1.d5c5aa93287ebp-3,  0x1.e274051823fa9p-3,
+	   0x1.ef0e656300c16p-3,  0x1.fb9509f05aa2ap-3,
+	   0x1.04041821f37afp-2,  0x1.0a340a49b3029p-2,
+	   0x1.105a7918a126dp-2,  0x1.1677819812b84p-2,
+	   0x1.1c8b405b40c0ep-2,  0x1.2295d16cfa6b1p-2,
+	   0x1.28975066318a2p-2,  0x1.2e8fd855d86fcp-2,
+	   0x1.347f83d605e59p-2,  0x1.3a666d1244588p-2,
+	   0x1.4044adb6f8ec4p-2,  0x1.461a5f077558cp-2,
+	   0x1.4be799e20b9c8p-2,  0x1.51ac76a6b79dfp-2,
+	   0x1.57690d5744a45p-2,  0x1.5d1d758e45217p-2},
+
+  .poly = {-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
+	   0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3},
+};
diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c
new file mode 100644
index 0000000..11f0b8a
--- /dev/null
+++ b/pl/math/sv_logf_3u4.c
@@ -0,0 +1,77 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define P(i) __sv_logf_poly[i]
+
+#define Ln2 (0x1.62e43p-1f) /* 0x3f317218.  */
+#define Min (0x00800000)
+#define Max (0x7f800000)
+#define Mask (0x007fffff)
+#define Off (0x3f2aaaab) /* 0.666667.  */
+
+float
+optr_aor_log_f32 (float);
+
+static NOINLINE sv_f32_t
+__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (optr_aor_log_f32, x, y, cmp);
+}
+
+/* Optimised implementation of SVE logf, using the same algorithm and polynomial
+   as the Neon routine in math/. Maximum error is 3.34 ULPs:
+   __sv_logf(0x1.557298p+0) got 0x1.26edecp-2
+			   want 0x1.26ede6p-2.  */
+sv_f32_t
+__sv_logf_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_u32_t u = sv_as_u32_f32 (x);
+  svbool_t cmp
+    = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min));
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u = svsub_n_u32_x (pg, u, Off);
+  sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u),
+						   23)); /* Sign-extend.  */
+  u = svand_n_u32_x (pg, u, Mask);
+  u = svadd_n_u32_x (pg, u, Off);
+  sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f);
+
+  /* y = log(1+r) + n*ln2.  */
+  sv_f32_t r2 = svmul_f32_x (pg, r, r);
+  /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).  */
+  sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2)));
+  sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4)));
+  sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6)));
+  p = sv_fma_n_f32_x (pg, P (0), r2, p);
+  q = sv_fma_f32_x (pg, p, r2, q);
+  y = sv_fma_f32_x (pg, q, r2, y);
+  p = sv_fma_n_f32_x (pg, Ln2, n, r);
+  y = sv_fma_f32_x (pg, y, r2, p);
+
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_logf_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf)
+
+PL_SIG (SV, F, 1, log, 0.01, 11.1)
+PL_TEST_ULP (__sv_logf, 2.85)
+PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000)
+PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000)
+#endif // SV_SUPPORTED
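
For reference, the reduction above can be restated as a minimal scalar sketch in plain C. It is illustrative only: the constants mirror the macros in this file and the coefficients are the ones from sv_logf_data.c below, but it skips the special-case dispatch for zero, negative, subnormal and non-finite inputs that the SVE routine delegates to optr_aor_log_f32, and it uses plain multiplies where the vector code uses fused multiply-adds.

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of __sv_logf_x: split x = 2^n * (1+r) with
   2/3 < 1+r < 4/3 (Off = asuint (2/3)), then evaluate the
   log(1+r) polynomial and add n*ln2.  */
static float
logf_sketch (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u); /* u = asuint (x).  */

  u -= 0x3f2aaaab;                         /* Off.  */
  float n = (float) ((int32_t) u >> 23);   /* Sign-extending shift.  */
  u = (u & 0x007fffff) + 0x3f2aaaab;       /* Mask, Off.  */
  float r;
  memcpy (&r, &u, sizeof r);
  r -= 1.0f;

  /* Same coefficients as __sv_logf_poly in sv_logf_data.c.  */
  const float P[] = { -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f,
		      0x1.961348p-3f,  -0x1.00187cp-2f, 0x1.555d7cp-2f,
		      -0x1.ffffc8p-2f };
  float r2 = r * r;
  float p = P[2] + r * P[1];
  float q = P[4] + r * P[3];
  float y = P[6] + r * P[5];
  p = p + r2 * P[0];
  q = q + r2 * p;
  y = y + r2 * q;

  /* y = n*ln2 + r + r2 * poly.  */
  return (n * 0x1.62e43p-1f + r) + r2 * y;
}
```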
diff --git a/pl/math/sv_logf_data.c b/pl/math/sv_logf_data.c
new file mode 100644
index 0000000..51dd7a7
--- /dev/null
+++ b/pl/math/sv_logf_data.c
@@ -0,0 +1,12 @@
+/*
+ * Coefficients for single-precision SVE log function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+const float __sv_logf_poly[] = {
+  /* Copied from coeffs for the Neon routine in math/.  */
+  -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
+  -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
+};
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
new file mode 100644
index 0000000..5ef0ad3
--- /dev/null
+++ b/pl/math/sv_math.h
@@ -0,0 +1,245 @@
+/*
+ * Wrapper functions for SVE ACLE.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef SV_MATH_H
+#define SV_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code.  */
+#define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+#if WANT_SVE_MATH
+#define SV_SUPPORTED 1
+
+#include <arm_sve.h>
+#include <stdbool.h>
+
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+typedef svfloat64_t sv_f64_t;
+typedef svuint64_t sv_u64_t;
+typedef svint64_t sv_s64_t;
+
+typedef svfloat32_t sv_f32_t;
+typedef svuint32_t sv_u32_t;
+typedef svint32_t sv_s32_t;
+
+/* Double precision.  */
+static inline sv_s64_t
+sv_s64 (s64_t x)
+{
+  return svdup_n_s64 (x);
+}
+
+static inline sv_u64_t
+sv_u64 (u64_t x)
+{
+  return svdup_n_u64 (x);
+}
+
+static inline sv_f64_t
+sv_f64 (f64_t x)
+{
+  return svdup_n_f64 (x);
+}
+
+static inline sv_f64_t
+sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_f64_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar. */
+static inline sv_f64_t
+sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z)
+{
+  return svmla_n_f64_x (pg, z, y, x);
+}
+
+static inline sv_s64_t
+sv_as_s64_u64 (sv_u64_t x)
+{
+  return svreinterpret_s64_u64 (x);
+}
+
+static inline sv_u64_t
+sv_as_u64_f64 (sv_f64_t x)
+{
+  return svreinterpret_u64_f64 (x);
+}
+
+static inline sv_f64_t
+sv_as_f64_u64 (sv_u64_t x)
+{
+  return svreinterpret_f64_u64 (x);
+}
+
+static inline sv_f64_t
+sv_to_f64_s64_x (svbool_t pg, sv_s64_t s)
+{
+  return svcvt_f64_x (pg, s);
+}
+
+static inline sv_f64_t
+sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f64_t elem = svclastb_n_f64 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f64_t y2 = svdup_n_f64 (elem);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+static inline sv_f64_t
+sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y,
+	      svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f64_t elem1 = svclastb_n_f64 (p, 0, x1);
+      f64_t elem2 = svclastb_n_f64 (p, 0, x2);
+      f64_t ret = (*f) (elem1, elem2);
+      sv_f64_t y2 = svdup_n_f64 (ret);
+      y = svsel_f64 (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+/* Load array of uint64_t into svuint64_t.  */
+static inline sv_u64_t
+sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx)
+{
+  return svld1_gather_u64index_u64 (pg, tab, idx);
+}
+
+/* Load array of double into svfloat64_t.  */
+static inline sv_f64_t
+sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx)
+{
+  return svld1_gather_u64index_f64 (pg, tab, idx);
+}
+
+static inline sv_u64_t
+sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y)
+{
+  sv_u64_t q = svdiv_n_u64_x (pg, x, y);
+  return svmls_n_u64_x (pg, x, q, y);
+}
+
+/* Single precision.  */
+static inline sv_s32_t
+sv_s32 (s32_t x)
+{
+  return svdup_n_s32 (x);
+}
+
+static inline sv_u32_t
+sv_u32 (u32_t x)
+{
+  return svdup_n_u32 (x);
+}
+
+static inline sv_f32_t
+sv_f32 (f32_t x)
+{
+  return svdup_n_f32 (x);
+}
+
+static inline sv_f32_t
+sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_f32_x (pg, z, x, y);
+}
+
+/* res = z + x * y with x scalar.  */
+static inline sv_f32_t
+sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
+{
+  return svmla_n_f32_x (pg, z, y, x);
+}
+
+static inline sv_u32_t
+sv_as_u32_f32 (sv_f32_t x)
+{
+  return svreinterpret_u32_f32 (x);
+}
+
+static inline sv_f32_t
+sv_as_f32_u32 (sv_u32_t x)
+{
+  return svreinterpret_f32_u32 (x);
+}
+
+static inline sv_s32_t
+sv_as_s32_u32 (sv_u32_t x)
+{
+  return svreinterpret_s32_u32 (x);
+}
+
+static inline sv_f32_t
+sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
+{
+  return svcvt_f32_x (pg, s);
+}
+
+static inline sv_s32_t
+sv_to_s32_f32_x (svbool_t pg, sv_f32_t x)
+{
+  return svcvt_s32_f32_x (pg, x);
+}
+
+static inline sv_f32_t
+sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f32_t elem = svclastb_n_f32 (p, 0, x);
+      elem = (*f) (elem);
+      sv_f32_t y2 = svdup_n_f32 (elem);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+static inline sv_f32_t
+sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y,
+	      svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      f32_t elem1 = svclastb_n_f32 (p, 0, x1);
+      f32_t elem2 = svclastb_n_f32 (p, 0, x2);
+      f32_t ret = (*f) (elem1, elem2);
+      sv_f32_t y2 = svdup_n_f32 (ret);
+      y = svsel_f32 (p, y2, y);
+      p = svpnext_b32 (cmp, p);
+    }
+  return y;
+}
+
+#endif
+#endif
+#endif
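
The sv_call_f32/sv_call_f64 helpers above implement the standard SVE scalar-fallback idiom: svpfirst isolates the first active lane of the special-case predicate, svclastb extracts that lane's value, the scalar routine is applied to it, svsel merges the result back, and svpnext advances to the next active lane. A plain-C model of the same semantics, with a bool array standing in for the predicate and n for the hardware vector length, might look like this:

```c
#include <stdbool.h>
#include <stddef.h>

/* Plain-C model of sv_call_f32: for each lane where cmp[i] is set,
   replace y[i] with f (x[i]); all other lanes keep the fast-path
   result already in y.  The SVE version does exactly this, one
   active lane at a time, via svpfirst/svclastb/svsel/svpnext.  */
static void
call_f32_model (float (*f) (float), const float *x, float *y,
		const bool *cmp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (cmp[i])
      y[i] = f (x[i]);
}
```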
diff --git a/pl/math/sv_powi.c b/pl/math/sv_powi.c
new file mode 100644
index 0000000..1bb0eb3
--- /dev/null
+++ b/pl/math/sv_powi.c
@@ -0,0 +1,53 @@
+/*
+ * Double-precision SVE powi(x, n) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+/* Optimized double-precision vector powi (double base, long integer power).
+   powi is developed for environments in which accuracy is of much less
+   importance than performance, hence we provide no estimate for worst-case
+   error.  */
+svfloat64_t
+__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p)
+{
+  /* Compute powi by successive squaring, right to left.  */
+  svfloat64_t acc = svdup_n_f64 (1.0);
+  svbool_t want_recip = svcmplt_n_s64 (p, ns, 0);
+  svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns));
+
+  /* We use a max to avoid needing to check whether any lane != 0 on each
+     iteration.  */
+  uint64_t max_n = svmaxv_u64 (p, ns_abs);
+
+  svfloat64_t c = as;
+  /* Successively square c, and use merging predication (_m) to determine
+     whether to perform the multiplication or keep the result of the
+     previous iteration.  */
+  while (true)
+    {
+      svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull);
+      acc = svmul_f64_m (px, acc, c);
+      max_n >>= 1;
+      if (max_n == 0)
+	break;
+
+      ns_abs = svlsr_n_u64_x (p, ns_abs, 1);
+      c = svmul_f64_x (p, c, c);
+    }
+
+  /* Negative powers are handled by computing the abs(n) version and then
+     taking the reciprocal.  */
+  if (svptest_any (want_recip, want_recip))
+    acc = svdivr_n_f64_m (want_recip, acc, 1.0);
+
+  return acc;
+}
+
+strong_alias (__sv_powi_x, _ZGVsMxvv_powk)
+
+#endif // SV_SUPPORTED
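
The loop above is binary exponentiation (successive squaring, right to left); a direct scalar transcription shows the structure without the predication. This is a sketch, not part of the routine:

```c
/* Scalar model of the loop in __sv_powi_x: scan the bits of |n| from
   least to most significant, squaring c at each step and multiplying
   it into acc when the current bit is set.  Negative powers take the
   reciprocal at the end, exactly as in the vector code.  */
static double
powi_sketch (double x, long n)
{
  unsigned long n_abs = n < 0 ? -(unsigned long) n : (unsigned long) n;
  double acc = 1.0;
  double c = x;
  while (1)
    {
      if (n_abs & 1)
	acc *= c;
      n_abs >>= 1;
      if (n_abs == 0)
	break;
      c *= c;
    }
  return n < 0 ? 1.0 / acc : acc;
}
```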
diff --git a/pl/math/sv_powif.c b/pl/math/sv_powif.c
new file mode 100644
index 0000000..d0567e3
--- /dev/null
+++ b/pl/math/sv_powif.c
@@ -0,0 +1,54 @@
+/*
+ * Single-precision SVE powi(x, n) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#if SV_SUPPORTED
+
+/* Optimized single-precision vector powi (float base, integer power).
+   powi is developed for environments in which accuracy is of much less
+   importance than performance, hence we provide no estimate for worst-case
+   error.  */
+svfloat32_t
+__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p)
+{
+  /* Compute powi by successive squaring, right to left.  */
+  svfloat32_t acc = svdup_n_f32 (1.f);
+  svbool_t want_recip = svcmplt_n_s32 (p, ns, 0);
+  svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns));
+
+  /* We use a max to avoid needing to check whether any lane != 0 on each
+     iteration.  */
+  uint32_t max_n = svmaxv_u32 (p, ns_abs);
+
+  svfloat32_t c = as;
+  /* Successively square c, and use merging predication (_m) to determine
+     whether to perform the multiplication or keep the result of the
+     previous iteration.  */
+  while (true)
+    {
+      svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1);
+      acc = svmul_f32_m (px, acc, c);
+      max_n >>= 1;
+      if (max_n == 0)
+	break;
+
+      ns_abs = svlsr_n_u32_x (p, ns_abs, 1);
+      c = svmul_f32_x (p, c, c);
+    }
+
+  /* Negative powers are handled by computing the abs(n) version and then
+     taking the reciprocal.  */
+  if (svptest_any (want_recip, want_recip))
+    acc = svdivr_n_f32_m (want_recip, acc, 1.0f);
+
+  return acc;
+}
+
+/* Note: no trailing f in the ZGV... name - the 64-bit integer version is powk.  */
+strong_alias (__sv_powif_x, _ZGVsMxvv_powi)
+
+#endif // SV_SUPPORTED
diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c
new file mode 100644
index 0000000..3fee080
--- /dev/null
+++ b/pl/math/sv_sin_3u.c
@@ -0,0 +1,89 @@
+/*
+ * Double-precision SVE sin(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define InvPi (sv_f64 (0x1.45f306dc9c883p-2))
+#define HalfPi (sv_f64 (0x1.921fb54442d18p+0))
+#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1))
+#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0))
+#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26))
+#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54))
+#define Shift (sv_f64 (0x1.8p52))
+#define RangeVal (sv_f64 (0x1p23))
+#define AbsMask (0x7fffffffffffffff)
+
+static NOINLINE sv_f64_t
+__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (sin, x, y, cmp);
+}
+
+/* A fast SVE implementation of sin based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum observed error is 2.52 ULP:
+   __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40
+				  want 0x1.10ace8f3e7868p-40.  */
+sv_f64_t
+__sv_sin_x (sv_f64_t x, const svbool_t pg)
+{
+  sv_f64_t n, r, r2, y;
+  sv_u64_t sign;
+  svbool_t cmp;
+
+  r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask));
+  sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask);
+  cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal));
+
+  /* n = rint(|x|/(pi/2)).  */
+  sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift);
+  n = svsub_f64_x (pg, q, Shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  r = sv_fma_f64_x (pg, NegPio2_1, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f64_x (pg, NegPio2_3, n, r);
+
+  /* Final multiplicative factor: 1.0 or r, depending on bit #0 of q.  */
+  sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q));
+
+  /* sin(r) poly approx.  */
+  r2 = svtsmul_f64 (r, sv_as_u64_f64 (q));
+  y = sv_f64 (0.0);
+  y = svtmad_f64 (y, r2, 7);
+  y = svtmad_f64 (y, r2, 6);
+  y = svtmad_f64 (y, r2, 5);
+  y = svtmad_f64 (y, r2, 4);
+  y = svtmad_f64 (y, r2, 3);
+  y = svtmad_f64 (y, r2, 2);
+  y = svtmad_f64 (y, r2, 1);
+  y = svtmad_f64 (y, r2, 0);
+
+  /* Apply factor.  */
+  y = svmul_f64_x (pg, f, y);
+
+  /* y = y ^ sign.  */
+  y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset of
+     pg, guaranteed by the cmpge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_sin_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin)
+
+PL_SIG (SV, D, 1, sin, -3.1, 3.1)
+PL_TEST_ULP (__sv_sin, 2.03)
+PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000)
+#endif
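
Two tricks from the reduction above recur throughout these routines: adding and then subtracting Shift (0x1.8p52) forces a value of sufficiently small magnitude to be rounded to the nearest integer in the current rounding mode, and the multiple of pi/2 is subtracted in three progressively lower-magnitude pieces (Cody-Waite) so each subtraction stays exact. A scalar sketch of the same reduction, using the constants defined above and assuming |x| is below RangeVal:

```c
#include <math.h>

/* Scalar model of the range reduction in __sv_sin_x: n = rint (r/(pi/2))
   via the Shift trick, then r = |x| - n*(pi/2) in three Cody-Waite
   pieces.  Link with -lm for fma/fabs.  */
static double
reduce_pio2 (double x, double *n_out)
{
  double r = fabs (x);
  double q = fma (0x1.45f306dc9c882p-1, r, 0x1.8p52); /* InvPio2, Shift.  */
  double n = q - 0x1.8p52;                            /* n = rint (r/(pi/2)).  */
  r = fma (-0x1.921fb50000000p+0, n, r);              /* NegPio2_1.  */
  r = fma (-0x1.110b460000000p-26, n, r);             /* NegPio2_2.  */
  r = fma (-0x1.1a62633145c07p-54, n, r);             /* NegPio2_3.  */
  *n_out = n;
  return r; /* r in [-pi/4, pi/4], with |x| = n*(pi/2) + r.  */
}
```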
diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c
new file mode 100644
index 0000000..9184ccd
--- /dev/null
+++ b/pl/math/sv_sinf_1u9.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision SVE sin(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+#define A3 (sv_f32 (__sv_sinf_data.coeffs[3]))
+#define A5 (sv_f32 (__sv_sinf_data.coeffs[2]))
+#define A7 (sv_f32 (__sv_sinf_data.coeffs[1]))
+#define A9 (sv_f32 (__sv_sinf_data.coeffs[0]))
+
+#define NegPi1 (sv_f32 (-0x1.921fb6p+1f))
+#define NegPi2 (sv_f32 (0x1.777a5cp-24f))
+#define NegPi3 (sv_f32 (0x1.ee59dap-49f))
+#define RangeVal (sv_f32 (0x1p20f))
+#define InvPi (sv_f32 (0x1.45f306p-2f))
+#define Shift (sv_f32 (0x1.8p+23f))
+#define AbsMask (0x7fffffff)
+
+static NOINLINE sv_f32_t
+__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (sinf, x, y, cmp);
+}
+
+/* A fast SVE implementation of sinf.
+   Maximum error: 1.89 ULPs.
+   This maximum error is achieved at multiple values in [-2^18, 2^18]
+   but one example is:
+   __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1.  */
+sv_f32_t
+__sv_sinf_x (sv_f32_t x, const svbool_t pg)
+{
+  sv_f32_t n, r, r2, y;
+  sv_u32_t sign, odd;
+  svbool_t cmp;
+
+  r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
+  sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask);
+  cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+
+  /* n = rint(|x|/pi).  */
+  n = sv_fma_f32_x (pg, InvPi, r, Shift);
+  odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31);
+  n = svsub_f32_x (pg, n, Shift);
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+  r = sv_fma_f32_x (pg, NegPi1, n, r);
+  r = sv_fma_f32_x (pg, NegPi2, n, r);
+  r = sv_fma_f32_x (pg, NegPi3, n, r);
+
+  /* sin(r) approx using a degree 9 polynomial from the Taylor series
+     expansion. Note that only the odd terms of this are non-zero.  */
+  r2 = svmul_f32_x (pg, r, r);
+  y = sv_fma_f32_x (pg, A9, r2, A7);
+  y = sv_fma_f32_x (pg, y, r2, A5);
+  y = sv_fma_f32_x (pg, y, r2, A3);
+  y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r);
+
+  /* y = y ^ sign ^ odd.  */
+  y = sv_as_f32_u32 (
+    sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd)));
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset of
+     pg, guaranteed by the cmpge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_sinf_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf)
+
+PL_SIG (SV, F, 1, sin, -3.1, 3.1)
+PL_TEST_ULP (__sv_sinf, 1.40)
+PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000)
+#endif
diff --git a/pl/math/sv_sinf_poly_data.c b/pl/math/sv_sinf_poly_data.c
new file mode 100644
index 0000000..1e1ab5e
--- /dev/null
+++ b/pl/math/sv_sinf_poly_data.c
@@ -0,0 +1,19 @@
+/*
+ * Data used in single-precision sin(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients for approximating sin(x) in single
+   precision. These are the non-zero coefficients from the
+   degree 9 Taylor series expansion of sin.  */
+
+const struct sv_sinf_data __sv_sinf_data = {.coeffs = {
+					      0x1.5b2e76p-19f,
+					      -0x1.9f42eap-13f,
+					      0x1.110df4p-7f,
+					      -0x1.555548p-3f,
+					    }};
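
These four coefficients are A9, A7, A5 and A3 in sv_sinf_1u9.c (coeffs[0] is the highest-order term). Since only odd powers of r appear, the polynomial is evaluated in Horner form over r^2 and the result multiplied back by r. A scalar sketch of the polynomial step, valid once |r| has been reduced into [-pi/2, pi/2]:

```c
/* Scalar model of the polynomial step in __sv_sinf_x:
   sin(r) ~ r + r^3 * (A3 + r^2*(A5 + r^2*(A7 + r^2*A9))).  */
static float
sinf_poly_sketch (float r)
{
  const float A9 = 0x1.5b2e76p-19f;  /* coeffs[0].  */
  const float A7 = -0x1.9f42eap-13f; /* coeffs[1].  */
  const float A5 = 0x1.110df4p-7f;   /* coeffs[2].  */
  const float A3 = -0x1.555548p-3f;  /* coeffs[3].  */

  float r2 = r * r;
  float y = A7 + r2 * A9;
  y = A5 + r2 * y;
  y = A3 + r2 * y;
  return r + r * r2 * y; /* Odd polynomial: multiply back by r.  */
}
```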
diff --git a/pl/math/sv_tanf_3u5.c b/pl/math/sv_tanf_3u5.c
new file mode 100644
index 0000000..cca43bd
--- /dev/null
+++ b/pl/math/sv_tanf_3u5.c
@@ -0,0 +1,112 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if SV_SUPPORTED
+
+/* Constants.  */
+#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
+#define InvPio2 (sv_f32 (0x1.45f306p-1f))
+#define RangeVal (sv_f32 (0x1p15f))
+#define Shift (sv_f32 (0x1.8p+23f))
+
+#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i])
+
+/* Use full Estrin's scheme to evaluate polynomial.  */
+static inline sv_f32_t
+eval_poly (svbool_t pg, sv_f32_t z)
+{
+  sv_f32_t z2 = svmul_f32_x (pg, z, z);
+  sv_f32_t z4 = svmul_f32_x (pg, z2, z2);
+  sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0));
+  sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2));
+  sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4));
+  sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10);
+  sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10);
+  return y;
+}
+
+static NOINLINE sv_f32_t
+__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (tanf, x, y, cmp);
+}
+
+/* Fast implementation of SVE tanf.
+   Maximum error is 3.45 ULP:
+   __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+			     want 0x1.ff9850p-1.  */
+sv_f32_t
+__sv_tanf_x (sv_f32_t x, const svbool_t pg)
+{
+  /* Determine whether input is too large for the fast reduction.  */
+  svbool_t cmp = svacge_f32 (pg, x, RangeVal);
+  svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0));
+
+  /* n = rint(x/(pi/2)).  */
+  sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift);
+  sv_f32_t n = svsub_f32_x (pg, q, Shift);
+  /* n already has an integral value; simply convert it to a signed integer.  */
+  sv_s32_t in = sv_to_s32_f32_x (pg, n);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
+  sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1));
+  svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0));
+
+  /* r = x - n * (pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  sv_f32_t r;
+  r = sv_fma_f32_x (pg, NegPio2_1, n, x);
+  r = sv_fma_f32_x (pg, NegPio2_2, n, r);
+  r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity then use symmetries of tangent and the identity
+       tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+       the same polynomial approximation of tan as above.  */
+
+  /* Perform additional reduction if required.  */
+  sv_f32_t z = svneg_f32_m (r, pred_alt, r);
+
+  /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4].  */
+  sv_f32_t z2 = svmul_f32_x (pg, z, z);
+  sv_f32_t p = eval_poly (pg, z2);
+  sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z);
+
+  /* Transform result back, if necessary.  */
+  sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y);
+  y = svsel_f32 (pred_alt, inv_y, y);
+
+  /* The fast reduction does not handle the x = -0.0 case well;
+     it is therefore fixed up here.  */
+  y = svsel_f32 (pred_minuszero, x, y);
+
+  /* No need to pass pg to specialcase here since cmp is a strict subset of
+     pg, guaranteed by the svacge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return __sv_tanf_specialcase (x, y, cmp);
+  return y;
+}
+
+PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf)
+
+PL_SIG (SV, F, 1, tan, -3.1, 3.1)
+PL_TEST_ULP (__sv_tanf, 2.96)
+PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000)
+PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000)
+PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000)
+PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000)
+PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000)
+#endif
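
eval_poly above uses Estrin's scheme rather than Horner's: adjacent coefficient pairs are combined first, then the pairs are combined using the precomputed powers z2 and z4. This shortens the dependency chain from five serial FMAs to three (after z2 and z4 are available), which is what makes it attractive for wide SIMD. The same degree-5 evaluation in scalar form, as a sketch:

```c
#include <math.h>

/* Scalar model of eval_poly above: Estrin evaluation of
   c0 + c1*z + c2*z^2 + c3*z^3 + c4*z^4 + c5*z^5.
   Link with -lm for fmaf.  */
static float
estrin_deg5 (float z, const float c[6])
{
  float z2 = z * z;
  float z4 = z2 * z2;
  float y_10 = fmaf (z, c[1], c[0]);
  float y_32 = fmaf (z, c[3], c[2]);
  float y_54 = fmaf (z, c[5], c[4]);
  float y_3210 = fmaf (z2, y_32, y_10);
  return fmaf (z4, y_54, y_3210);
}
```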
diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c
new file mode 100644
index 0000000..ec006dc
--- /dev/null
+++ b/pl/math/tanf_3u3.c
@@ -0,0 +1,202 @@
+/*
+ * Single-precision scalar tan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "pairwise_hornerf.h"
+
+/* Useful constants.  */
+#define NegPio2_1 (-0x1.921fb6p+0f)
+#define NegPio2_2 (0x1.777a5cp-25f)
+#define NegPio2_3 (0x1.ee59dap-50f)
+/* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps.  */
+#define RangeVal (0x1p17f)
+#define InvPio2 ((0x1.45f306p-1f))
+#define Shift (0x1.8p+23f)
+#define AbsMask (0x7fffffff)
+#define Pio4 (0x1.921fb6p-1)
+/* 2PI * 2^-64.  */
+#define Pio2p63 (0x1.921FB54442D18p-62)
+
+#define P(i) __tanf_poly_data.poly_tan[i]
+#define Q(i) __tanf_poly_data.poly_cotan[i]
+
+static inline float
+eval_P (float z)
+{
+  return PAIRWISE_HORNER_5 (z, z * z, P);
+}
+
+static inline float
+eval_Q (float z)
+{
+  return PAIRWISE_HORNER_3 (z, z * z, Q);
+}
+
+/* Reduction of the input argument x using the Cody-Waite approach, such that
+   x = r + n * pi/2, where r lies in [-pi/4, pi/4] and n is a signed integer.  */
+static inline float
+reduce (float x, int32_t *in)
+{
+  /* n = rint(x/(pi/2)).  */
+  float r = x;
+  float q = fmaf (InvPio2, r, Shift);
+  float n = q - Shift;
+  /* There is no rounding here; n is exactly representable as a signed integer.  */
+  *in = (int32_t) n;
+  /* r = x - n * (pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  r = fmaf (NegPio2_1, n, r);
+  r = fmaf (NegPio2_2, n, r);
+  r = fmaf (NegPio2_3, n, r);
+  return r;
+}
+
+/* Table with 4/PI to 192 bit precision.  To avoid unaligned accesses
+   only 8 new bits are added per entry, making the table 4 times larger.  */
+static const uint32_t __inv_pio4[24]
+  = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44,
+     0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1,
+     0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62,
+     0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041};
+
+/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
+   XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
+   Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
+   Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
+   multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
+   can have at most 29 leading zeros after the binary point, the double
+   precision result is accurate to 33 bits.  */
+static inline double
+reduce_large (uint32_t xi, int *np)
+{
+  const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
+  int shift = (xi >> 23) & 7;
+  uint64_t n, res0, res1, res2;
+
+  xi = (xi & 0xffffff) | 0x800000;
+  xi <<= shift;
+
+  res0 = xi * arr[0];
+  res1 = (uint64_t) xi * arr[4];
+  res2 = (uint64_t) xi * arr[8];
+  res0 = (res2 >> 32) | (res0 << 32);
+  res0 += res1;
+
+  n = (res0 + (1ULL << 61)) >> 62;
+  res0 -= n << 62;
+  double x = (int64_t) res0;
+  *np = n;
+  return x * Pio2p63;
+}
+
+/* Top 12 bits of the float representation with the sign bit cleared.  */
+static inline uint32_t
+top12 (float x)
+{
+  return (asuint (x) >> 20);
+}
+
+/* Fast single-precision tan implementation.
+   Maximum error: 3.293 ULP:
+   tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1.  */
+float
+tanf (float x)
+{
+  /* Get top words.  */
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+
+  /* Dispatch between no reduction (small numbers), fast reduction, and slow
+     reduction for large numbers. The reduction step determines a float r
+     (|r| < pi/4) and a signed integer n such that x = r + n * pi/2.  */
+  int32_t n;
+  float r;
+  if (ia12 < top12 (Pio4))
+    {
+      /* Optimize small values.  */
+      if (unlikely (ia12 < top12 (0x1p-12f)))
+	{
+	  if (unlikely (ia12 < top12 (0x1p-126f)))
+	    /* Force underflow for tiny x.  */
+	    force_eval_float (x * x);
+	  return x;
+	}
+
+      /* tan (x) ~= x + x^3 * P(x^2).  */
+      float x2 = x * x;
+      float y = eval_P (x2);
+      return fmaf (x2, x * y, x);
+    }
+  /* As in other trigonometric routines, fast but inaccurate reduction is
+     performed for values of x from pi/4 up to RangeVal. To keep errors below
+     3.5 ULP, RangeVal is set to 2^17 here, which may differ from other
+     trigonometric routines. Above this value, more advanced but slower
+     reduction techniques are needed to reach a similar accuracy.  */
+  else if (ia12 < top12 (RangeVal))
+    {
+      /* Fast inaccurate reduction.  */
+      r = reduce (x, &n);
+    }
+  else if (ia12 < 0x7f8)
+    {
+      /* Slow accurate reduction.  */
+      uint32_t sign = ix & ~AbsMask;
+      double dar = reduce_large (ia, &n);
+      float ar = (float) dar;
+      r = asfloat (asuint (ar) ^ sign);
+    }
+  else
+    {
+      /* tan(Inf or NaN) is NaN.  */
+      return __math_invalidf (x);
+    }
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite then use an approximation of tangent in the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity then use an approximation of cotangent in the form
+       cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early.
+       Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r),
+       we only need to change the sign of r to obtain tan(x) from cotan(r).
+     This 2-interval approach requires 2 different sets of coefficients P and
+     Q, where Q is a lower order polynomial than P.  */
+
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
+  uint32_t alt = (uint32_t) n & 1;
+
+  /* Perform additional reduction if required.  */
+  float z = alt ? -r : r;
+
+  /* Prepare backward transformation.  */
+  float z2 = r * r;
+  float offset = alt ? 1.0f / z : z;
+  float scale = alt ? z : z * z2;
+
+  /* Evaluate polynomial approximation of tan or cotan.  */
+  float p = alt ? eval_Q (z2) : eval_P (z2);
+
+  /* A unified way of assembling the result on both interval types.  */
+  return fmaf (scale, p, offset);
+}
+
+PL_SIG (S, F, 1, tan, -3.1, 3.1)
+PL_TEST_ULP (tanf, 2.80)
+PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
+PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000)
+PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
+PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000)
+PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000)
+PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000)
+PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000)
+PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000)
+PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
+PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000)
+PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000)
+PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000)
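
A quick way to exercise the three dispatch paths of tanf above (no reduction, fast Cody-Waite reduction, slow 4/pi table reduction) is to probe one point per region and compare against double-precision tan. The sample points below are arbitrary illustrative choices; the real coverage comes from the PL_TEST_INTERVAL sweeps above.

```c
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* One probe per region: |x| < pi/4, pi/4 <= |x| < 2^17, |x| >= 2^17.  */
  const float probes[] = { 0.5f, 100.0f, 0x1p20f };
  for (unsigned i = 0; i < sizeof probes / sizeof probes[0]; i++)
    {
      float x = probes[i];
      printf ("tanf(%a) = %a (ref %a)\n", (double) x, (double) tanf (x),
	      tan ((double) x));
    }
  return 0;
}
```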
diff --git a/pl/math/tanf_data.c b/pl/math/tanf_data.c
new file mode 100644
index 0000000..a6b9d51
--- /dev/null
+++ b/pl/math/tanf_data.c
@@ -0,0 +1,45 @@
+/*
+ * Data used in single-precision tan(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct tanf_poly_data __tanf_poly_data = {
+.poly_tan = {
+/* Coefficients generated using:
+   poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]);
+   optimize relative error
+   final prec : 23 bits
+   deg : 5
+   a : 0x1p-126 ^ 2
+   b : ((pi) / 0x1p2) ^ 2
+   dirty rel error: 0x1.f7c2e4p-25
+   dirty abs error: 0x1.f7c2ecp-25.  */
+0x1.55555p-2,
+0x1.11166p-3,
+0x1.b88a78p-5,
+0x1.7b5756p-6,
+0x1.4ef4cep-8,
+0x1.0e1e74p-7
+},
+.poly_cotan = {
+/* Coefficients generated using:
+   fpminimax(f(x) = (0x1p0 / tan(sqrt(x)) - 0x1p0 / sqrt(x)) / sqrt(x), deg, [|dtype ...|], [a;b])
+   optimize a single polynomial
+   optimize absolute error
+   final prec : 23 bits
+   working prec : 128 bits
+   deg : 3
+   a : 0x1p-126
+   b : (pi) / 0x1p2
+   dirty rel error : 0x1.81298cp-25
+   dirty abs error : 0x1.a8acf4p-25.  */
+-0x1.55555p-2, /* -0.33333325.  */
+-0x1.6c23e4p-6, /* -2.2225354e-2.  */
+-0x1.12dbap-9, /* -2.0969994e-3.  */
+-0x1.05a1c2p-12, /* -2.495116e-4.  */
+}
+};
diff --git a/pl/math/tanh_3u.c b/pl/math/tanh_3u.c
new file mode 100644
index 0000000..46d9fb3
--- /dev/null
+++ b/pl/math/tanh_3u.c
@@ -0,0 +1,82 @@
+/*
+ * Double-precision tanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define InvLn2 0x1.71547652b82fep0
+#define Ln2hi 0x1.62e42fefa39efp-1
+#define Ln2lo 0x1.abc9e3b39803fp-56
+#define Shift 0x1.8p52
+#define C(i) __expm1_poly[i]
+
+#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4).  */
+#define TinyBound 0x3e40000000000000   /* asuint64 (0x1p-27).  */
+#define One 0x3ff0000000000000
+
+static inline double
+expm1_inline (double x)
+{
+  /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with
+     several simplifications:
+     - No special-case handling for tiny or special values.
+     - Simpler combination of p and t in final stage of the algorithm.
+     - Use shift-and-add instead of ldexp to calculate t.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  double j = fma (InvLn2, x, Shift) - Shift;
+  int64_t i = j;
+  double f = fma (j, -Ln2hi, x);
+  f = fma (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) using polynomial.  */
+  double f2 = f * f;
+  double f4 = f2 * f2;
+  double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+
+  /* t = 2 ^ i.  */
+  double t = asdouble ((uint64_t) (i + 1023) << 52);
+  /* expm1(x) = p * t + (t - 1).  */
+  return fma (p, t, t - 1);
+}
+
+/* Approximation for double-precision tanh(x), using a simplified version of
+   expm1. The greatest observed error is 2.75 ULP:
+   tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3
+			      want -0x1.ba31ba4691ab4p-3.  */
+double
+tanh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  uint64_t sign = ix & ~AbsMask;
+
+  if (unlikely (ia > BoringBound))
+    {
+      if (ia > 0x7ff0000000000000)
+	return __math_invalid (x);
+      return asdouble (One | sign);
+    }
+
+  if (unlikely (ia < TinyBound))
+    return x;
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  double q = expm1_inline (2 * x);
+  return q / (q + 2);
+}
+
+PL_SIG (S, D, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (tanh, 2.26)
+PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000)
+PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000)
+PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000)
+PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000)
+PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000)
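
The final two lines of tanh above rest on a short identity: with q = e^(2x) - 1, tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2). Computing q through expm1 rather than exp avoids catastrophic cancellation in the numerator near x = 0 (tiny inputs return early in any case). A scalar restatement using libm's expm1 in place of the inlined helper:

```c
#include <math.h>

/* Scalar restatement of the identity used by tanh above: with
   q = e^(2x) - 1, tanh(x) = q / (q + 2).  No special-case handling;
   link with -lm.  */
static double
tanh_via_expm1 (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2);
}
```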
diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
new file mode 100644
index 0000000..76e54a4
--- /dev/null
+++ b/pl/math/tanhf_2u6.c
@@ -0,0 +1,91 @@
+/*
+ * Single-precision tanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define BoringBound                                                            \
+  0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
+		negative).  */
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+
+#define C(i) __expm1f_poly[i]
+
+static inline float
+expm1f_inline (float x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from expm1f_1u6.c, with several simplifications:
+     - No special-case handling for tiny or special values, instead return early
+       from the main routine.
+     - No special handling for large values:
+       - No early return for infinity.
+       - Simpler combination of p and t in final stage of algorithm.
+       - |i| < 27, so can calculate t by simpler shift-and-add, instead of
+	 ldexpf (same as vector algorithm).  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float j = fmaf (InvLn2, x, Shift) - Shift;
+  int32_t i = j;
+  float f = fmaf (j, -Ln2hi, x);
+  f = fmaf (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main expm1f routine uses Horner.  */
+  float f2 = f * f;
+  float p_01 = fmaf (f, C (1), C (0));
+  float p_23 = fmaf (f, C (3), C (2));
+  float p = fmaf (f2, p_23, p_01);
+  p = fmaf (f2 * f2, C (4), p);
+  p = fmaf (f2, p, f);
+
+  /* t = 2^i.  */
+  float t = asfloat ((uint32_t) (i + 127) << 23);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return fmaf (p, t, t - 1);
+}
+
+/* Approximation for single-precision tanh(x), using a simplified version of
+   expm1f. The maximum error is 2.58 ULP:
+   tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+		      want 0x1.f9ba08p-5.  */
+float
+tanhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax > BoringBound))
+    {
+      if (iax > 0x7f800000)
+	return __math_invalidf (x);
+      return asfloat (One | sign);
+    }
+
+  if (unlikely (iax < 0x34000000))
+    return x;
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float q = expm1f_inline (2 * x);
+  return q / (q + 2);
+}
+
+PL_SIG (S, F, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (tanhf, 2.09)
+PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
+PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100)
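
Both expm1 helpers build t = 2^i by writing the biased exponent straight into the exponent field instead of calling ldexp: in single precision the bias is 127 and the exponent occupies bits 23-30, so (i + 127) << 23 reinterpreted as a float is exactly 2^i whenever i + 127 stays in the normal range. A standalone sketch of the trick:

```c
#include <stdint.h>
#include <string.h>

/* Build 2^i for a float by writing the biased exponent directly, as
   expm1f_inline above does.  Valid for -126 <= i <= 127 (normal range);
   outside that the bit pattern is no longer a power of two.  */
static float
exp2i_sketch (int32_t i)
{
  uint32_t bits = (uint32_t) (i + 127) << 23;
  float t;
  memcpy (&t, &bits, sizeof t);
  return t;
}
```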
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
new file mode 100644
index 0000000..e0f6ac7
--- /dev/null
+++ b/pl/math/test/mathbench_funcs.h
@@ -0,0 +1,86 @@
+// clang-format off
+/*
+ * Function entries for mathbench.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _ZSF1(fun, a, b) F(fun##f, a, b)
+#define _ZSD1(f, a, b) D(f, a, b)
+
+#ifdef __vpcs
+
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
+
+#elif __aarch64__
+
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
+
+#elif WANT_VMATH
+
+#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b)
+#define _ZVD1(f, a, b) D(__s_##f, a, b)
+
+#else
+
+#define _ZVF1(f, a, b)
+#define _ZVD1(f, a, b)
+
+#endif
+
+#if WANT_SVE_MATH
+
+#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
+#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+
+#else
+
+#define _ZSVF1(f, a, b)
+#define _ZSVD1(f, a, b)
+
+#endif
+
+/* No auto-generated wrappers for binary functions - they have to be
+   manually defined in mathbench_wrappers.h. We have to define silent
+   macros for them anyway as they will be emitted by PL_SIG.  */
+#define _ZSF2(...)
+#define _ZSD2(...)
+#define _ZVF2(...)
+#define _ZVD2(...)
+#define _ZSVF2(...)
+#define _ZSVD2(...)
+
+#include "mathbench_funcs_gen.h"
+
+/* PL_SIG only emits entries for unary functions, since if a function
+   needs to be wrapped in mathbench there is no way for it to know the
+   name of the wrapper. Add entries for binary functions, or any other
+   exotic signatures that need wrapping, below.  */
+
+{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+{"atan2",  'd', 0, -10.0, 10.0, {.d = atan2_wrap}},
+{"powi",   'd', 0,  0.01, 11.1, {.d = powi_wrap}},
+
+{"__s_atan2f",       'f', 0,   -10.0, 10.0, {.f = __s_atan2f_wrap}},
+{"__s_atan2",        'd', 0,   -10.0, 10.0, {.d = __s_atan2_wrap}},
+{"__v_atan2f",       'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
+{"__v_atan2",        'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
+{"__vn_atan2f",      'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}},
+{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}},
+{"__vn_atan2",       'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}},
+{"_ZGVnN2vv_atan2",  'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}},
+
+#if WANT_SVE_MATH
+{"__sv_atan2f_x",    'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}},
+{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}},
+{"__sv_atan2_x",     'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}},
+{"_ZGVsM2vv_atan2",  'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
+{"__sv_powif_x",     'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}},
+{"_ZGVsMxvv_powi",   'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}},
+{"__sv_powi_x",      'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}},
+{"_ZGVsMxvv_powk",   'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}},
+#endif
+  // clang-format on
diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h
new file mode 100644
index 0000000..eba960e
--- /dev/null
+++ b/pl/math/test/mathbench_wrappers.h
@@ -0,0 +1,133 @@
+/*
+ * Function wrappers for mathbench.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+static double
+atan2_wrap (double x)
+{
+  return atan2 (5.0, x);
+}
+
+static float
+atan2f_wrap (float x)
+{
+  return atan2f (5.0f, x);
+}
+
+static double
+powi_wrap (double x)
+{
+  return __builtin_powi (x, (int) round (x));
+}
+
+#if WANT_VMATH
+#if __aarch64__
+
+static double
+__s_atan2_wrap (double x)
+{
+  return __s_atan2 (5.0, x);
+}
+
+static float
+__s_atan2f_wrap (float x)
+{
+  return __s_atan2f (5.0f, x);
+}
+
+static v_double
+__v_atan2_wrap (v_double x)
+{
+  return __v_atan2 (v_double_dup (5.0), x);
+}
+
+static v_float
+__v_atan2f_wrap (v_float x)
+{
+  return __v_atan2f (v_float_dup (5.0f), x);
+}
+
+#ifdef __vpcs
+
+__vpcs static v_double
+__vn_atan2_wrap (v_double x)
+{
+  return __vn_atan2 (v_double_dup (5.0), x);
+}
+
+__vpcs static v_float
+__vn_atan2f_wrap (v_float x)
+{
+  return __vn_atan2f (v_float_dup (5.0f), x);
+}
+
+__vpcs static v_double
+_Z_atan2_wrap (v_double x)
+{
+  return _ZGVnN2vv_atan2 (v_double_dup (5.0), x);
+}
+
+__vpcs static v_float
+_Z_atan2f_wrap (v_float x)
+{
+  return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x);
+}
+
+#endif // __vpcs
+#endif // __aarch64__
+#endif // WANT_VMATH
+
+#if WANT_SVE_MATH
+
+static sv_float
+__sv_atan2f_wrap (sv_float x, sv_bool pg)
+{
+  return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg);
+}
+
+static sv_float
+_Z_sv_atan2f_wrap (sv_float x, sv_bool pg)
+{
+  return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg);
+}
+
+static sv_double
+__sv_atan2_wrap (sv_double x, sv_bool pg)
+{
+  return __sv_atan2_x (x, svdup_n_f64 (5.0), pg);
+}
+
+static sv_double
+_Z_sv_atan2_wrap (sv_double x, sv_bool pg)
+{
+  return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg);
+}
+
+static sv_float
+_Z_sv_powi_wrap (sv_float x, sv_bool pg)
+{
+  return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg);
+}
+
+static sv_float
+__sv_powif_wrap (sv_float x, sv_bool pg)
+{
+  return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg);
+}
+
+static sv_double
+_Z_sv_powk_wrap (sv_double x, sv_bool pg)
+{
+  return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
+}
+
+static sv_double
+__sv_powi_wrap (sv_double x, sv_bool pg)
+{
+  return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg);
+}
+
+#endif // WANT_SVE_MATH
diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h
new file mode 100644
index 0000000..467d1ca
--- /dev/null
+++ b/pl/math/test/pl_test.h
@@ -0,0 +1,33 @@
+/*
+ * PL macros for emitting various details about routines for consumption by
+ * runulp.sh.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
+ */
+
+/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV
+   on PL_TEST_ULP to add EXPECT_FENV to all scalar routines.  */
+#if !(V_SUPPORTED || SV_SUPPORTED)
+#define PL_TEST_ULP(f, l)                                                      \
+  PL_TEST_EXPECT_FENV_ALWAYS (f)                                               \
+  PL_TEST_ULP f l
+#else
+#define PL_TEST_ULP(f, l) PL_TEST_ULP f l
+#endif
+
+/* Emit aliases to allow test params to be mapped from aliases back to their
+   aliasees.  */
+#define PL_ALIAS(a, b) PL_TEST_ALIAS a b
+
+/* Emit routine name if e == 1 and f is expected to correctly trigger fenv
+   exceptions. e allows declaration to be emitted conditionally upon certain
+   build flags - defer expansion by one pass to allow those flags to be expanded
+   properly.  */
+#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e)
+#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f)
+#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f
+#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1)
+
+#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n
+#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c
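
The two-step definition of PL_TEST_EXPECT_FENV above is the standard trick for forcing an extra pass of macro expansion: token-pasting with ## suppresses expansion of its operands, so pasting PL_TEST_EXPECT_FENV_##e directly would paste the unexpanded flag name, whereas routing through PL_TEST_EXPECT_FENV_ lets e expand to its value first. A minimal illustration of the idiom, using hypothetical names:

```c
/* Minimal illustration of the deferred-expansion idiom used by
   PL_TEST_EXPECT_FENV.  WANT_THING is a hypothetical build flag.  */
#define WANT_THING 1

#define CONCAT_DIRECT(a, b) a##b
#define CONCAT_(a, b) a##b
#define CONCAT(a, b) CONCAT_ (a, b)

/* CONCAT_DIRECT (PREFIX_, WANT_THING) -> PREFIX_WANT_THING
   (wrong: WANT_THING was pasted before it could expand).
   CONCAT (PREFIX_, WANT_THING)        -> PREFIX_1
   (the flag expands first, as with PL_TEST_EXPECT_FENV_##e
   after the deferred pass).  */
```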
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
new file mode 100755
index 0000000..4d02530
--- /dev/null
+++ b/pl/math/test/runulp.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# ULP error check script.
+#
+# Copyright (c) 2019-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+#set -x
+set -eu
+
+# cd to bin directory.
+cd "${0%/*}"
+
+flags="${ULPFLAGS:--q}"
+emu="$@"
+
+# Enable SVE testing
+WANT_SVE_MATH=${WANT_SVE_MATH:-0}
+
+FAIL=0
+PASS=0
+
+t() {
+	key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}')
+	L=$(cat $LIMITS | grep "^$key " | awk '{print $2}')
+	[[ $L =~ ^[0-9]+\.[0-9]+$ ]]
+	extra_flags=""
+	[[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5"
+	grep -q "^$key$" $FENV || extra_flags="$extra_flags -f"
+	$emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
+}
+
+check() {
+	$emu ./ulp -f -q "$@" #>/dev/null
+}
+
+# Regression-test for correct NaN handling in atan2
+check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000
+check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan
+check atan2 nan nan x -nan -nan
+
+# vector functions
+flags="${ULPFLAGS:--q}"
+runs=
+check __s_log10f 1 && runs=1
+runv=
+check __v_log10f 1 && runv=1
+runvn=
+check __vn_log10f 1 && runvn=1
+runsv=
+if [ $WANT_SVE_MATH -eq 1 ]; then
+check __sv_cosf 0 && runsv=1
+check __sv_cos  0 && runsv=1
+check __sv_sinf 0 && runsv=1
+check __sv_sin 0 && runsv=1
+# No guarantees about powi accuracy, so regression-test for exactness
+# w.r.t. the custom reference impl in ulp_wrappers.h
+check -q -f -e 0 __sv_powif  0  inf x  0  1000 100000 && runsv=1
+check -q -f -e 0 __sv_powif -0 -inf x  0  1000 100000 && runsv=1
+check -q -f -e 0 __sv_powif  0  inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 __sv_powi   0  inf x  0  1000 100000 && runsv=1
+check -q -f -e 0 __sv_powi  -0 -inf x  0  1000 100000 && runsv=1
+check -q -f -e 0 __sv_powi   0  inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 __sv_powi  -0 -inf x -0 -1000 100000 && runsv=1
+fi
+
+while read F LO HI N C
+do
+	t $F $LO $HI $N $C
+done << EOF
+$(cat $INTERVALS)
+EOF
+
+[ 0 -eq $FAIL ] || {
+	echo "FAILED $FAIL PASSED $PASS"
+	exit 1
+}
diff --git a/pl/math/test/testcases/directed/acosh.tst b/pl/math/test/testcases/directed/acosh.tst
new file mode 100644
index 0000000..dd962bd
--- /dev/null
+++ b/pl/math/test/testcases/directed/acosh.tst
@@ -0,0 +1,19 @@
+; acosh.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=acosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=acosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=acosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=acosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=acosh op1=3ff00000.00000000 result=00000000.00000000 errno=0
+func=acosh op1=3fefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=00000000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=80000000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=bfefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acosh op1=7fe01ac0.7f03a83e result=40862e50.541778f1.8cc error=0
diff --git a/pl/math/test/testcases/directed/acoshf.tst b/pl/math/test/testcases/directed/acoshf.tst
new file mode 100644
index 0000000..606c615
--- /dev/null
+++ b/pl/math/test/testcases/directed/acoshf.tst
@@ -0,0 +1,19 @@
+; acoshf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=acoshf op1=7fc00001 result=7fc00001 errno=0
+func=acoshf op1=ffc00001 result=7fc00001 errno=0
+func=acoshf op1=7f800001 result=7fc00001 errno=0 status=i
+func=acoshf op1=ff800001 result=7fc00001 errno=0 status=i
+func=acoshf op1=7f800000 result=7f800000 errno=0
+func=acoshf op1=3f800000 result=00000000 errno=0
+func=acoshf op1=3f7fffff result=7fc00001 errno=EDOM status=i
+func=acoshf op1=00000000 result=7fc00001 errno=EDOM status=i
+func=acoshf op1=80000000 result=7fc00001 errno=EDOM status=i
+func=acoshf op1=bf7fffff result=7fc00001 errno=EDOM status=i
+func=acoshf op1=bf800000 result=7fc00001 errno=EDOM status=i
+func=acoshf op1=bf800001 result=7fc00001 errno=EDOM status=i
+func=acoshf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=acoshf op1=7f767efe result=42b2c19d.83e error=0
diff --git a/pl/math/test/testcases/directed/asinh.tst b/pl/math/test/testcases/directed/asinh.tst
new file mode 100644
index 0000000..1485dfe
--- /dev/null
+++ b/pl/math/test/testcases/directed/asinh.tst
@@ -0,0 +1,18 @@
+; asinh.tst
+;
+; Copyright (c) 2022-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=asinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=asinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=asinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=asinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=asinh op1=fff00000.00000000 result=fff00000.00000000 errno=0
+func=asinh op1=00000000.00000000 result=00000000.00000000 errno=0
+func=asinh op1=80000000.00000000 result=80000000.00000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not perform any floating-point
+; operations and thus may not raise exceptions.
+func=asinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=asinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/asinhf.tst b/pl/math/test/testcases/directed/asinhf.tst
new file mode 100644
index 0000000..eb76a58
--- /dev/null
+++ b/pl/math/test/testcases/directed/asinhf.tst
@@ -0,0 +1,18 @@
+; asinhf.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=asinhf op1=7fc00001 result=7fc00001 errno=0
+func=asinhf op1=ffc00001 result=7fc00001 errno=0
+func=asinhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=asinhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=asinhf op1=7f800000 result=7f800000 errno=0
+func=asinhf op1=ff800000 result=ff800000 errno=0
+func=asinhf op1=00000000 result=00000000 errno=0
+func=asinhf op1=80000000 result=80000000 errno=0
+; No exception is raised on certain machines (different versions of glibc).
+; The same issue is encountered with other functions approximated by x close
+; to 0: the implementation may be so trivial that no floating-point operation
+; is involved.
+func=asinhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=asinhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/atan.tst b/pl/math/test/testcases/directed/atan.tst
new file mode 100644
index 0000000..4c67055
--- /dev/null
+++ b/pl/math/test/testcases/directed/atan.tst
@@ -0,0 +1,22 @@
+; atan.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan op1=00000000.00000000 result=00000000.00000000 errno=0
+func=atan op1=80000000.00000000 result=80000000.00000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not perform any floating-point
+; operations and thus may not raise exceptions.
+func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
+
+func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0
+func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0
diff --git a/pl/math/test/testcases/directed/atan2.tst b/pl/math/test/testcases/directed/atan2.tst
new file mode 100644
index 0000000..647b376
--- /dev/null
+++ b/pl/math/test/testcases/directed/atan2.tst
@@ -0,0 +1,110 @@
+; atan2.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=7ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000000 result=3fe921fb.54442d18.469 errno=0
+func=atan2 op1=7ff00000.00000000 op2=fff00000.00000000 result=4002d97c.7f3321d2.34f errno=0
+func=atan2 op1=7ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=7ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=7ff00000.00000000 op2=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=7ff00000.00000000 op2=bff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=fff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=fff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=fff00000.00000000 op2=7ff00000.00000000 result=bfe921fb.54442d18.469 errno=0
+func=atan2 op1=fff00000.00000000 op2=fff00000.00000000 result=c002d97c.7f3321d2.34f errno=0
+func=atan2 op1=fff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=fff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=fff00000.00000000 op2=3ff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=fff00000.00000000 op2=bff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=00000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=00000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=00000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=00000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=00000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0
+func=atan2 op1=00000000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0
+func=atan2 op1=00000000.00000000 op2=00000000.00000000 result=00000000.00000000 errno=0
+func=atan2 op1=00000000.00000000 op2=80000000.00000000 result=400921fb.54442d18.469 errno=0
+func=atan2 op1=00000000.00000000 op2=3ff00000.00000000 result=00000000.00000000 errno=0
+func=atan2 op1=00000000.00000000 op2=bff00000.00000000 result=400921fb.54442d18.469 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan2 op1=00000000.00000001 op2=3ff00000.00000000 result=00000000.00000001 errno=0 maybestatus=ux
+func=atan2 op1=80000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=80000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=80000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=80000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=80000000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0
+func=atan2 op1=80000000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0
+func=atan2 op1=80000000.00000000 op2=00000000.00000000 result=80000000.00000000 errno=0
+func=atan2 op1=80000000.00000000 op2=80000000.00000000 result=c00921fb.54442d18.469 errno=0
+func=atan2 op1=80000000.00000000 op2=3ff00000.00000000 result=80000000.00000000 errno=0
+func=atan2 op1=80000000.00000000 op2=bff00000.00000000 result=c00921fb.54442d18.469 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan2 op1=80000000.00000001 op2=3ff00000.00000000 result=80000000.00000001 errno=0 maybestatus=ux
+func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=3ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=3ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=3ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0
+func=atan2 op1=3ff00000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0
+func=atan2 op1=3ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=3ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0
+func=atan2 op1=3ff00000.00000000 op2=bff00000.00000000 result=4002d97c.7f3321d2.34f errno=0
+func=atan2 op1=bff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=bff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan2 op1=bff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=bff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan2 op1=bff00000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0
+func=atan2 op1=bff00000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0
+func=atan2 op1=bff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=bff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan2 op1=bff00000.00000000 op2=3ff00000.00000000 result=bfe921fb.54442d18.469 errno=0
+func=atan2 op1=bff00000.00000000 op2=bff00000.00000000 result=c002d97c.7f3321d2.34f errno=0
+func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18 errno=0
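
The vectors above pin down atan2's full IEEE 754 special-case table (signed zeros, infinities, NaN propagation). As an informal cross-check rather than part of the harness, a small C99 program against the host libm should reproduce the same quadrant results, since C99 Annex F fixes atan2 on these inputs:

```c
#include <math.h>
#include <stdio.h>

/* Illustrative cross-check of the directed atan2 special cases above;
   C99 Annex F specifies atan2 on signed zeros and infinities.  */
int main (void)
{
  printf ("atan2(+0, +0)    = % g\n", atan2 (0.0, 0.0));             /* +0     */
  printf ("atan2(+0, -0)    = % g\n", atan2 (0.0, -0.0));            /* +pi    */
  printf ("atan2(-0, -0)    = % g\n", atan2 (-0.0, -0.0));           /* -pi    */
  printf ("atan2(inf, inf)  = % g\n", atan2 (INFINITY, INFINITY));   /* +pi/4  */
  printf ("atan2(inf, -inf) = % g\n", atan2 (INFINITY, -INFINITY));  /* +3pi/4 */
  printf ("atan2(1, +inf)   = % g\n", atan2 (1.0, INFINITY));        /* +0     */
  printf ("atan2(1, -inf)   = % g\n", atan2 (1.0, -INFINITY));       /* +pi    */
  printf ("atan2(1, +0)     = % g\n", atan2 (1.0, 0.0));             /* +pi/2  */
  return 0;
}
```
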
diff --git a/pl/math/test/testcases/directed/atan2f.tst b/pl/math/test/testcases/directed/atan2f.tst
new file mode 100644
index 0000000..85c5c5d
--- /dev/null
+++ b/pl/math/test/testcases/directed/atan2f.tst
@@ -0,0 +1,121 @@
+; atan2f.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=7fc00001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=ffc00001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=7f800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=ff800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=00000000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=80000000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=3f800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800001 op2=bf800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=7fc00001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=ffc00001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=7f800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=ff800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=00000000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=80000000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=3f800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800001 op2=bf800000 result=7fc00001 errno=0 status=i
+func=atan2f op1=7fc00001 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7fc00001 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7fc00001 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=ffc00001 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=7f800000 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=ff800000 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=00000000 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=80000000 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=3f800000 result=7fc00001 errno=0
+func=atan2f op1=7fc00001 op2=bf800000 result=7fc00001 errno=0
+func=atan2f op1=ffc00001 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ffc00001 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ffc00001 op2=7fc00001 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=7f800000 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=ff800000 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=00000000 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=80000000 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=3f800000 result=ffc00001 errno=0
+func=atan2f op1=ffc00001 op2=bf800000 result=ffc00001 errno=0
+func=atan2f op1=7f800000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=7f800000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=7f800000 op2=ffc00001 result=7fc00001 errno=0
+func=atan2f op1=7f800000 op2=7f800000 result=3f490fda.a22 errno=0
+func=atan2f op1=7f800000 op2=ff800000 result=4016cbe3.f99 errno=0
+func=atan2f op1=7f800000 op2=00000000 result=3fc90fda.a22 errno=0
+func=atan2f op1=7f800000 op2=80000000 result=3fc90fda.a22 errno=0
+func=atan2f op1=7f800000 op2=3f800000 result=3fc90fda.a22 errno=0
+func=atan2f op1=7f800000 op2=bf800000 result=3fc90fda.a22 errno=0
+func=atan2f op1=ff800000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=ff800000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=ff800000 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=ff800000 op2=7f800000 result=bf490fda.a22 errno=0
+func=atan2f op1=ff800000 op2=ff800000 result=c016cbe3.f99 errno=0
+func=atan2f op1=ff800000 op2=00000000 result=bfc90fda.a22 errno=0
+func=atan2f op1=ff800000 op2=80000000 result=bfc90fda.a22 errno=0
+func=atan2f op1=ff800000 op2=3f800000 result=bfc90fda.a22 errno=0
+func=atan2f op1=ff800000 op2=bf800000 result=bfc90fda.a22 errno=0
+func=atan2f op1=00000000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=00000000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=00000000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=00000000 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=00000000 op2=7f800000 result=00000000 errno=0
+func=atan2f op1=00000000 op2=ff800000 result=40490fda.a22 errno=0
+func=atan2f op1=00000000 op2=00000000 result=00000000 errno=0
+func=atan2f op1=00000000 op2=80000000 result=40490fda.a22 errno=0
+func=atan2f op1=00000000 op2=3f800000 result=00000000 errno=0
+func=atan2f op1=00000000 op2=bf800000 result=40490fda.a22 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan2f op1=00000001 op2=3f800000 result=00000001 errno=0 maybestatus=ux
+
+func=atan2f op1=80000000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=80000000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=80000000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=80000000 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=80000000 op2=7f800000 result=80000000 errno=0
+func=atan2f op1=80000000 op2=ff800000 result=c0490fda.a22 errno=0
+func=atan2f op1=80000000 op2=00000000 result=80000000 errno=0
+func=atan2f op1=80000000 op2=80000000 result=c0490fda.a22 errno=0
+func=atan2f op1=80000000 op2=3f800000 result=80000000 errno=0
+func=atan2f op1=80000000 op2=bf800000 result=c0490fda.a22 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan2f op1=80000001 op2=3f800000 result=80000001 errno=0 maybestatus=ux
+
+func=atan2f op1=3f800000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=3f800000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=3f800000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=3f800000 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=3f800000 op2=7f800000 result=00000000 errno=0
+func=atan2f op1=3f800000 op2=ff800000 result=40490fda.a22 errno=0
+func=atan2f op1=3f800000 op2=00000000 result=3fc90fda.a22 errno=0
+func=atan2f op1=3f800000 op2=80000000 result=3fc90fda.a22 errno=0
+func=atan2f op1=3f800000 op2=3f800000 result=3f490fda.a22 errno=0
+func=atan2f op1=3f800000 op2=bf800000 result=4016cbe3.f99 errno=0
+func=atan2f op1=bf800000 op2=7f800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=bf800000 op2=ff800001 result=7fc00001 errno=0 status=i
+func=atan2f op1=bf800000 op2=7fc00001 result=7fc00001 errno=0
+func=atan2f op1=bf800000 op2=ffc00001 result=ffc00001 errno=0
+func=atan2f op1=bf800000 op2=7f800000 result=80000000 errno=0
+func=atan2f op1=bf800000 op2=ff800000 result=c0490fda.a22 errno=0
+func=atan2f op1=bf800000 op2=00000000 result=bfc90fda.a22 errno=0
+func=atan2f op1=bf800000 op2=80000000 result=bfc90fda.a22 errno=0
+func=atan2f op1=bf800000 op2=3f800000 result=bf490fda.a22 errno=0
+func=atan2f op1=bf800000 op2=bf800000 result=c016cbe3.f99 errno=0
+func=atan2f op1=8005f16d op2=002bb601 result=be0a60a5.d88 error=0
+func=atan2f op1=80818ec8 op2=80ba5db9 result=c0222eda.f42 error=0
+
+func=atan2f op1=ff7fffff op2=ff7fffff result=c016cbe3.f99 errno=0
+func=atan2f op1=bfc00001 op2=7f7fffff result=80300000.700 errno=0 status=u
+func=atan2f op1=80800001 op2=40000000 result=80400000.800 errno=0 status=u
diff --git a/pl/math/test/testcases/directed/atanf.tst b/pl/math/test/testcases/directed/atanf.tst
new file mode 100644
index 0000000..0a0bfc2
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanf.tst
@@ -0,0 +1,22 @@
+; atanf.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanf op1=7fc00001 result=7fc00001 errno=0
+func=atanf op1=ffc00001 result=7fc00001 errno=0
+func=atanf op1=7f800001 result=7fc00001 errno=0 status=i
+func=atanf op1=ff800001 result=7fc00001 errno=0 status=i
+func=atanf op1=7f800000 result=3fc90fda.a22 errno=0
+func=atanf op1=ff800000 result=bfc90fda.a22 errno=0
+func=atanf op1=00000000 result=00000000 errno=0
+func=atanf op1=80000000 result=80000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=atanf op1=80000001 result=80000001 errno=0 maybestatus=ux
+
+func=atanf op1=3f800000 result=3f490fda.a22 errno=0
+func=atanf op1=bf800000 result=bf490fda.a22 errno=0
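
The `maybestatus=ux` entries encode the comment above: when |x| is tiny, atanf(x) rounds to x itself, so an implementation may return x without performing any floating-point operation, and no underflow is raised. A hypothetical sketch of such a fast path (threshold and names invented for illustration; not Arm's implementation):

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical shape of a near-zero fast path.  Returning x directly
   executes no FP arithmetic, so the underflow and inexact flags stay
   clear -- hence `maybestatus=ux` in the vectors above.  */
static uint32_t as_u32 (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

float toy_atanf (float x)
{
  uint32_t ia = as_u32 (x) & 0x7fffffff;
  if (ia < 0x39800000) /* |x| < 2^-12: illustrative cutoff.  */
    return x;
  /* General path: truncated series, illustration only; real code
     reduces the argument and evaluates a tuned polynomial.  */
  return x - (x * x * x) / 3.0f;
}
```
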
diff --git a/pl/math/test/testcases/directed/atanh.tst b/pl/math/test/testcases/directed/atanh.tst
new file mode 100644
index 0000000..d96ff32
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanh.tst
@@ -0,0 +1,22 @@
+; atanh.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atanh op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=atanh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=atanh op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=atanh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=atanh op1=3ff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z
+func=atanh op1=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=z
+func=atanh op1=00000000.00000000 result=00000000.00000000 errno=0
+func=atanh op1=80000000.00000000 result=80000000.00000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=atanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/atanhf.tst b/pl/math/test/testcases/directed/atanhf.tst
new file mode 100644
index 0000000..21a68a6
--- /dev/null
+++ b/pl/math/test/testcases/directed/atanhf.tst
@@ -0,0 +1,23 @@
+; atanhf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atanhf op1=7fc00001 result=7fc00001 errno=0
+func=atanhf op1=ffc00001 result=7fc00001 errno=0
+func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i
+func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z
+func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z
+func=atanhf op1=00000000 result=00000000 errno=0
+func=atanhf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/pl/math/test/testcases/directed/cbrtf.tst
new file mode 100644
index 0000000..0dd8d09
--- /dev/null
+++ b/pl/math/test/testcases/directed/cbrtf.tst
@@ -0,0 +1,29 @@
+; cbrtf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=cbrtf op1=7f800000 result=7f800000 errno=0
+func=cbrtf op1=ff800000 result=ff800000 errno=0
+func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i
+func=cbrtf op1=7fc00001 result=7fc00001 errno=0
+func=cbrtf op1=00000000 result=00000000 errno=0
+func=cbrtf op1=00000001 result=26a14517.cc7 errno=0
+func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0
+func=cbrtf op1=00000003 result=26e89768.579 errno=0
+func=cbrtf op1=00000004 result=27000000.000 errno=0
+func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0
+func=cbrtf op1=00800000 result=2a800000.000 errno=0
+func=cbrtf op1=3f800000 result=3f800000.000 errno=0
+func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0
+func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0
+func=cbrtf op1=80000000 result=80000000 errno=0
+func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0
+func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0
+func=cbrtf op1=80000003 result=a6e89768.579 errno=0
+func=cbrtf op1=80000004 result=a7000000.000 errno=0
+func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0
+func=cbrtf op1=80800000 result=aa800000.000 errno=0
+func=cbrtf op1=bf800000 result=bf800000.000 errno=0
+func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0
+func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0
diff --git a/pl/math/test/testcases/directed/cosh.tst b/pl/math/test/testcases/directed/cosh.tst
new file mode 100644
index 0000000..c4efacb
--- /dev/null
+++ b/pl/math/test/testcases/directed/cosh.tst
@@ -0,0 +1,15 @@
+; cosh.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=cosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=cosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=cosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=cosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=cosh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=cosh op1=fff00000.00000000 result=7ff00000.00000000 errno=0
+func=cosh op1=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=cosh op1=00000000.00000000 result=3ff00000.00000000 errno=0
+func=cosh op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/pl/math/test/testcases/directed/coshf.tst b/pl/math/test/testcases/directed/coshf.tst
new file mode 100644
index 0000000..2b967e7
--- /dev/null
+++ b/pl/math/test/testcases/directed/coshf.tst
@@ -0,0 +1,15 @@
+; coshf.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=coshf op1=7fc00001 result=7fc00001 errno=0
+func=coshf op1=ffc00001 result=7fc00001 errno=0
+func=coshf op1=7f800001 result=7fc00001 errno=0 status=i
+func=coshf op1=ff800001 result=7fc00001 errno=0 status=i
+func=coshf op1=7f800000 result=7f800000 errno=0
+func=coshf op1=7f7fffff result=7f800000 errno=ERANGE status=ox
+func=coshf op1=ff800000 result=7f800000 errno=0
+func=coshf op1=ff7fffff result=7f800000 errno=ERANGE status=ox
+func=coshf op1=00000000 result=3f800000 errno=0
+func=coshf op1=80000000 result=3f800000 errno=0
diff --git a/pl/math/test/testcases/directed/erfc.tst b/pl/math/test/testcases/directed/erfc.tst
new file mode 100644
index 0000000..c03fc59
--- /dev/null
+++ b/pl/math/test/testcases/directed/erfc.tst
@@ -0,0 +1,23 @@
+; erfc.tst - Directed test cases for erfc
+;
+; Copyright (c) 2022-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erfc op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erfc op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erfc op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erfc op1=7ff00000.00000000 result=00000000.00000000 errno=0
+func=erfc op1=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+; We deliberately turned off errno setting in erfc, as the standard
+; only states that errno `may` be set to ERANGE in case of underflow.
+; As a result, the following condition on errno cannot be satisfied.
+;
+; func=erfc op1=403b44af.48b01531 result=00000000.00000000 errno=ERANGE status=ux
+;
+func=erfc op1=c03b44af.48b01531 result=40000000.00000000 errno=0
+func=erfc op1=403bffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+func=erfc op1=c03bffff.ffffffff result=40000000.00000000 errno=0
+func=erfc op1=fff00000.00000000 result=40000000.00000000 errno=0
+func=erfc op1=00000000.00000000 result=3ff00000.00000000 errno=0
+func=erfc op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/pl/math/test/testcases/directed/erfcf.tst b/pl/math/test/testcases/directed/erfcf.tst
new file mode 100644
index 0000000..719bacc
--- /dev/null
+++ b/pl/math/test/testcases/directed/erfcf.tst
@@ -0,0 +1,14 @@
+; erfcf.tst - Directed test cases for erfcf
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=erfcf op1=7fc00001 result=7fc00001 errno=0
+func=erfcf op1=ffc00001 result=7fc00001 errno=0
+func=erfcf op1=7f800001 result=7fc00001 errno=0 status=i
+func=erfcf op1=ff800001 result=7fc00001 errno=0 status=i
+func=erfcf op1=7f800000 result=00000000 errno=0
+func=erfcf op1=7f7fffff result=00000000 errno=ERANGE status=ux
+func=erfcf op1=ff800000 result=40000000 errno=0
+func=erfcf op1=00000000 result=3f800000 errno=0
+func=erfcf op1=80000000 result=3f800000 errno=0
diff --git a/pl/math/test/testcases/directed/erff.tst b/pl/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000..9b1d3d5
--- /dev/null
+++ b/pl/math/test/testcases/directed/erff.tst
@@ -0,0 +1,17 @@
+; erff.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
diff --git a/pl/math/test/testcases/directed/expm1.tst b/pl/math/test/testcases/directed/expm1.tst
new file mode 100644
index 0000000..609d6f4
--- /dev/null
+++ b/pl/math/test/testcases/directed/expm1.tst
@@ -0,0 +1,21 @@
+; expm1.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=expm1 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=expm1 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=expm1 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=expm1 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=expm1 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=expm1 op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=expm1 op1=ffefffff.ffffffff result=bff00000.00000000 errno=0
+func=expm1 op1=00000000.00000000 result=00000000.00000000 errno=0
+func=expm1 op1=80000000.00000000 result=80000000.00000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=expm1 op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=expm1 op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/expm1f.tst b/pl/math/test/testcases/directed/expm1f.tst
new file mode 100644
index 0000000..44c3842
--- /dev/null
+++ b/pl/math/test/testcases/directed/expm1f.tst
@@ -0,0 +1,57 @@
+; expm1f.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=expm1f op1=7fc00001 result=7fc00001 errno=0
+func=expm1f op1=ffc00001 result=7fc00001 errno=0
+func=expm1f op1=7f800001 result=7fc00001 errno=0 status=i
+func=expm1f op1=ff800001 result=7fc00001 errno=0 status=i
+func=expm1f op1=7f800000 result=7f800000 errno=0
+func=expm1f op1=7f7fffff result=7f800000 errno=ERANGE status=ox
+func=expm1f op1=ff800000 result=bf800000 errno=0
+func=expm1f op1=ff7fffff result=bf800000 errno=0
+func=expm1f op1=00000000 result=00000000 errno=0
+func=expm1f op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+
+func=expm1f op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=expm1f op1=80000001 result=80000001 errno=0 maybestatus=ux
+
+func=expm1f op1=42b145c0 result=7f6ac2dd.9b8 errno=0
+
+; Check both sides of the over/underflow thresholds in the code.
+func=expm1f op1=c2000000 result=bf7fffff.fff error=0
+func=expm1f op1=c2000001 result=bf7fffff.fff error=0
+func=expm1f op1=43000000 result=7f800000 error=overflow
+func=expm1f op1=43000001 result=7f800000 error=overflow
+func=expm1f op1=c2a80000 result=bf800000.000 error=0
+func=expm1f op1=c2a80001 result=bf800000.000 error=0
+
+; Check values for which exp goes denormal. expm1f should not report
+; spurious overflow.
+func=expm1f op1=c2b00f34 result=bf800000.000 error=0
+func=expm1f op1=c2ce8ed0 result=bf800000.000 error=0
+func=expm1f op1=c2dc6bba result=bf800000.000 error=0
+
+; Regression tests for significance loss when the two components of
+; the result have opposite sign but similar magnitude
+func=expm1f op1=be8516c1 result=be6a652b.0dc error=0
+func=expm1f op1=be851714 result=be6a65ab.0e5 error=0
+func=expm1f op1=be851cc7 result=be6a6e75.111 error=0
+func=expm1f op1=be851d1a result=be6a6ef5.102 error=0
+func=expm1f op1=be851d6d result=be6a6f75.0f2 error=0
+func=expm1f op1=be852065 result=be6a7409.0e4 error=0
+func=expm1f op1=be8520b8 result=be6a7489.0c7 error=0
+func=expm1f op1=be85210b result=be6a7509.0a8 error=0
+func=expm1f op1=be855401 result=be6ac39b.0d5 error=0
+func=expm1f op1=be933307 result=be7fdbf0.d8d error=0
+func=expm1f op1=be92ed6b result=be7f737a.d81 error=0
+func=expm1f op1=be933b90 result=be7fe8be.d76 error=0
+func=expm1f op1=3eb11364 result=3ed38deb.0c0 error=0
+func=expm1f op1=3f28e830 result=3f6f344b.0da error=0
+func=expm1f op1=3eb1578f result=3ed3ee47.13b error=0
+func=expm1f op1=3f50176a result=3fa08e36.fea error=0
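
The significance-loss block above guards the implementation's internal cancellation; the same phenomenon in its crudest form is why expm1 exists as a primitive at all: computing expf(x) - 1 directly loses everything once expf(x) rounds to 1. A quick demonstration with an illustrative input:

```c
#include <math.h>
#include <stdio.h>

/* For x = 1e-8f, expf(x) rounds to exactly 1.0f (half an ulp of 1.0f
   is about 6e-8), so the naive form returns 0 while expm1f keeps full
   precision.  Illustrative demo, not part of the harness.  */
int main (void)
{
  float x = 1e-8f;
  printf ("expf(x) - 1.0f = %a\n", expf (x) - 1.0f); /* 0x0p+0 */
  printf ("expm1f(x)      = %a\n", expm1f (x));      /* ~1e-8  */
  return 0;
}
```
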
diff --git a/pl/math/test/testcases/directed/log10.tst b/pl/math/test/testcases/directed/log10.tst
new file mode 100644
index 0000000..3483143
--- /dev/null
+++ b/pl/math/test/testcases/directed/log10.tst
@@ -0,0 +1,16 @@
+; log10.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=log10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=log10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log10 op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=log10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=log10 op1=3ff00000.00000000 result=00000000.00000000 errno=0
+func=log10 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=log10 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
+func=log10 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
+func=log10 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i
diff --git a/pl/math/test/testcases/directed/log10f.tst b/pl/math/test/testcases/directed/log10f.tst
new file mode 100644
index 0000000..d5744a6
--- /dev/null
+++ b/pl/math/test/testcases/directed/log10f.tst
@@ -0,0 +1,69 @@
+; log10f.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log10f op1=7fc00001 result=7fc00001 errno=0
+func=log10f op1=ffc00001 result=7fc00001 errno=0
+func=log10f op1=7f800001 result=7fc00001 errno=0 status=i
+func=log10f op1=ff800001 result=7fc00001 errno=0 status=i
+func=log10f op1=ff810000 result=7fc00001 errno=0 status=i
+func=log10f op1=7f800000 result=7f800000 errno=0
+func=log10f op1=3f800000 result=00000000 errno=0
+func=log10f op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=log10f op1=00000000 result=ff800000 errno=ERANGE status=z
+func=log10f op1=80000000 result=ff800000 errno=ERANGE status=z
+func=log10f op1=80000001 result=7fc00001 errno=EDOM status=i
+
+; Directed tests for the special-case handling of log10 of things
+; very near 1
+func=log10f op1=3f81a618 result=3bb62472.b92 error=0
+func=log10f op1=3f876783 result=3cc811f4.26c error=0
+func=log10f op1=3f816af8 result=3b9cc4c7.057 error=0
+func=log10f op1=3f7bed7d result=bbe432cb.e23 error=0
+func=log10f op1=3f803ece result=3a59ff3a.a84 error=0
+func=log10f op1=3f80089f result=38ef9728.aa6 error=0
+func=log10f op1=3f86ab72 result=3cb4b711.457 error=0
+func=log10f op1=3f780854 result=bc60f953.904 error=0
+func=log10f op1=3f7c6d76 result=bbc7fd01.01c error=0
+func=log10f op1=3f85dff6 result=3c9fa76f.81f error=0
+func=log10f op1=3f7b87f4 result=bbfa9edc.be4 error=0
+func=log10f op1=3f81c710 result=3bc4457b.745 error=0
+func=log10f op1=3f80946d result=3b00a140.c06 error=0
+func=log10f op1=3f7e87ea result=bb23cd70.828 error=0
+func=log10f op1=3f811437 result=3b6ee960.b40 error=0
+func=log10f op1=3f858dcf result=3c971d9b.2ea error=0
+func=log10f op1=3f7f61a3 result=ba89b814.4e0 error=0
+func=log10f op1=3f82d642 result=3c1bfb8d.517 error=0
+func=log10f op1=3f80f3bc result=3b52ebe8.c75 error=0
+func=log10f op1=3f85eff9 result=3ca150d9.7e8 error=0
+func=log10f op1=3f843eb8 result=3c68263f.771 error=0
+func=log10f op1=3f78e691 result=bc481cf4.50a error=0
+func=log10f op1=3f87c56f result=3cd1b268.5e6 error=0
+func=log10f op1=3f83b711 result=3c4b94c5.918 error=0
+func=log10f op1=3f823b2b result=3bf5eb02.e2a error=0
+func=log10f op1=3f7f2c4e result=bab82c80.519 error=0
+func=log10f op1=3f83fc92 result=3c5a3ba1.543 error=0
+func=log10f op1=3f793956 result=bc3ee04e.03c error=0
+func=log10f op1=3f839ba5 result=3c45caca.92a error=0
+func=log10f op1=3f862f30 result=3ca7de76.16f error=0
+func=log10f op1=3f832a20 result=3c2dc6e9.afd error=0
+func=log10f op1=3f810296 result=3b5fb92a.429 error=0
+func=log10f op1=3f7e58c9 result=bb38655a.0a4 error=0
+func=log10f op1=3f8362e7 result=3c39cc65.d15 error=0
+func=log10f op1=3f7fdb85 result=b97d9016.40b error=0
+func=log10f op1=3f84484e result=3c6a29f2.f74 error=0
+func=log10f op1=3f861862 result=3ca5819e.f2d error=0
+func=log10f op1=3f7c027b result=bbdf912d.440 error=0
+func=log10f op1=3f867803 result=3caf6744.34d error=0
+func=log10f op1=3f789a89 result=bc509bce.458 error=0
+func=log10f op1=3f8361d9 result=3c399347.379 error=0
+func=log10f op1=3f7d3ac3 result=bb9ad93a.93d error=0
+func=log10f op1=3f7ee241 result=baf8bd12.a62 error=0
+func=log10f op1=3f83a1fd result=3c4721bd.0a4 error=0
+func=log10f op1=3f840da3 result=3c5dd375.675 error=0
+func=log10f op1=3f79c2fe result=bc2f8a60.8c5 error=0
+func=log10f op1=3f854a93 result=3c901cc9.add error=0
+func=log10f op1=3f87a50a result=3cce6125.cd6 error=0
+func=log10f op1=3f818bf5 result=3baaee68.a55 error=0
+func=log10f op1=3f830a44 result=3c2705c4.d87 error=0
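
These near-1 vectors probe the region where the result is proportional to the small offset e = x - 1 (log10(1+e) ~ e/ln 10), so any loss of relative accuracy in e shows up directly in the answer. One standard way to keep that accuracy, shown as a sketch rather than the code under test, is to route through log1pf:

```c
#include <math.h>
#include <stdio.h>

/* Sketch of the near-1 identity log10(1+e) = log1p(e) / ln(10); not
   the tested implementation.  The constant is ln(10) rounded to float. */
static float log10_near_one (float e)
{
  const float ln10 = 2.3025851f;
  return log1pf (e) / ln10;
}

int main (void)
{
  /* Offset of the first directed input above (3f81a618) from 1.0.  */
  float e = 0x1.a618p-7f;
  printf ("log10f(1 + e)    = %a\n", log10f (1.0f + e));
  printf ("log1pf(e)/ln(10) = %a\n", log10_near_one (e));
  return 0;
}
```
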
diff --git a/pl/math/test/testcases/directed/log1p.tst b/pl/math/test/testcases/directed/log1p.tst
new file mode 100644
index 0000000..9ee8c62
--- /dev/null
+++ b/pl/math/test/testcases/directed/log1p.tst
@@ -0,0 +1,22 @@
+; log1p.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=log1p op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=log1p op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log1p op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log1p op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i
+func=log1p op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+; Cases 6, 9, 10, 11, 12 fail with certain versions of glibc and not others.
+; The main reason seems to be the handling of errno and exceptions.
+
+func=log1p op1=00000000.00000000 result=00000000.00000000 errno=0
+func=log1p op1=80000000.00000000 result=80000000.00000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=log1p op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=log1p op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/log1pf.tst b/pl/math/test/testcases/directed/log1pf.tst
new file mode 100644
index 0000000..aaa01d6
--- /dev/null
+++ b/pl/math/test/testcases/directed/log1pf.tst
@@ -0,0 +1,130 @@
+; log1pf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log1pf op1=7fc00001 result=7fc00001 errno=0
+func=log1pf op1=ffc00001 result=7fc00001 errno=0
+func=log1pf op1=7f800001 result=7fc00001 errno=0 status=i
+func=log1pf op1=ff800001 result=7fc00001 errno=0 status=i
+func=log1pf op1=ff810000 result=7fc00001 errno=0 status=i
+func=log1pf op1=7f800000 result=7f800000 errno=0
+
+; Cases 6, 9, 10, 11, 12 fail with certain versions of glibc and not others.
+; The main reason seems to be the handling of errno and exceptions.
+
+func=log1pf op1=00000000 result=00000000 errno=0
+func=log1pf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=log1pf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=log1pf op1=80000001 result=80000001 errno=0 maybestatus=ux
+
+func=log1pf op1=3f1e91ee result=3ef6d127.fdb errno=0
+func=log1pf op1=3f201046 result=3ef8a881.fba errno=0
+func=log1pf op1=3f21b916 result=3efab23b.f9f errno=0
+func=log1pf op1=3f21bde6 result=3efab821.fee errno=0
+func=log1pf op1=3f22a5ee result=3efbd435.ff2 errno=0
+func=log1pf op1=3f231b56 result=3efc63b7.e26 errno=0
+func=log1pf op1=3f23ce96 result=3efd3e83.fc8 errno=0
+func=log1pf op1=3eee18c6 result=3ec38576.02e errno=0
+func=log1pf op1=3eee2f41 result=3ec394ce.057 errno=0
+func=log1pf op1=3eee770d result=3ec3c5cc.00c errno=0
+func=log1pf op1=3eee7fed result=3ec3cbda.065 errno=0
+func=log1pf op1=3eee8fb2 result=3ec3d69c.008 errno=0
+func=log1pf op1=3eeeb8eb result=3ec3f2ba.061 errno=0
+func=log1pf op1=3eeeccfd result=3ec4006a.01d errno=0
+func=log1pf op1=3eeef5f0 result=3ec41c56.020 errno=0
+func=log1pf op1=3eeeff12 result=3ec42290.00c errno=0
+func=log1pf op1=3eef05cf result=3ec42728.052 errno=0
+func=log1pf op1=3eef13d3 result=3ec430b6.00e errno=0
+func=log1pf op1=3eef2e70 result=3ec442da.04a errno=0
+func=log1pf op1=3eef3fbf result=3ec44ea6.055 errno=0
+func=log1pf op1=3eef3feb result=3ec44ec4.021 errno=0
+func=log1pf op1=3eef4399 result=3ec45146.011 errno=0
+func=log1pf op1=3eef452e result=3ec4525a.049 errno=0
+func=log1pf op1=3eef4ea9 result=3ec458d0.020 errno=0
+func=log1pf op1=3eef7365 result=3ec471d8.05e errno=0
+func=log1pf op1=3eefa38f result=3ec492a8.003 errno=0
+func=log1pf op1=3eefb1f1 result=3ec49c74.015 errno=0
+func=log1pf op1=3eefb334 result=3ec49d50.023 errno=0
+func=log1pf op1=3eefb3c1 result=3ec49db0.0bf errno=0
+func=log1pf op1=3eefb591 result=3ec49eec.15d errno=0
+func=log1pf op1=3eefd736 result=3ec4b5d6.02d errno=0
+func=log1pf op1=3eefd797 result=3ec4b618.114 errno=0
+func=log1pf op1=3eefee5d result=3ec4c59a.071 errno=0
+func=log1pf op1=3eeffff4 result=3ec4d194.0a7 errno=0
+func=log1pf op1=3ef00cd1 result=3ec4da56.025 errno=0
+func=log1pf op1=3ef0163a result=3ec4e0be.07a errno=0
+func=log1pf op1=3ef01e89 result=3ec4e666.007 errno=0
+func=log1pf op1=3ef02004 result=3ec4e768.00a errno=0
+func=log1pf op1=3ef02c40 result=3ec4efbc.017 errno=0
+func=log1pf op1=3ef05b50 result=3ec50fc4.031 errno=0
+func=log1pf op1=3ef05bb1 result=3ec51006.05f errno=0
+func=log1pf op1=3ef0651b result=3ec5166e.0d9 errno=0
+func=log1pf op1=3ef06609 result=3ec51710.02a errno=0
+func=log1pf op1=3ef0666a result=3ec51752.049 errno=0
+func=log1pf op1=3ef0791e result=3ec5240c.0a8 errno=0
+func=log1pf op1=3ef07d46 result=3ec526e0.00e errno=0
+func=log1pf op1=3ef091fd result=3ec534f8.03c errno=0
+func=log1pf op1=3ef09602 result=3ec537b4.128 errno=0
+func=log1pf op1=3ef09848 result=3ec53940.044 errno=0
+func=log1pf op1=3ef0a04f result=3ec53eb6.07d errno=0
+func=log1pf op1=3ef0ab6a result=3ec54644.062 errno=0
+func=log1pf op1=3ef0ae49 result=3ec54838.002 errno=0
+func=log1pf op1=3ef0c1b8 result=3ec55570.000 errno=0
+func=log1pf op1=3ef0ca06 result=3ec55b16.00d errno=0
+func=log1pf op1=3ef0cc29 result=3ec55c8a.095 errno=0
+func=log1pf op1=3ef0d228 result=3ec5609e.04f errno=0
+func=log1pf op1=3ef0d8c0 result=3ec5651a.05e errno=0
+func=log1pf op1=3ef0dc0c result=3ec56758.029 errno=0
+func=log1pf op1=3ef0e0e8 result=3ec56aa6.02e errno=0
+func=log1pf op1=3ef0e502 result=3ec56d70.102 errno=0
+func=log1pf op1=3ef0e754 result=3ec56f04.017 errno=0
+func=log1pf op1=3ef0efe9 result=3ec574da.01c errno=0
+func=log1pf op1=3ef0f309 result=3ec576fa.016 errno=0
+func=log1pf op1=3ef0f499 result=3ec5780a.005 errno=0
+func=log1pf op1=3ef0f6c2 result=3ec57982.083 errno=0
+func=log1pf op1=3ef0f852 result=3ec57a92.05d errno=0
+func=log1pf op1=3ef0f9e2 result=3ec57ba2.02e errno=0
+func=log1pf op1=3ef119ee result=3ec5916c.024 errno=0
+func=log1pf op1=3ef11edf result=3ec594c8.03d errno=0
+func=log1pf op1=3ef128c4 result=3ec59b82.001 errno=0
+func=log1pf op1=3ef12ac1 result=3ec59cdc.04b errno=0
+func=log1pf op1=3ef12fea result=3ec5a05e.045 errno=0
+func=log1pf op1=3ef131e7 result=3ec5a1b8.05a errno=0
+func=log1pf op1=3ef134e1 result=3ec5a3be.00e errno=0
+func=log1pf op1=3ef1397a result=3ec5a6de.127 errno=0
+func=log1pf op1=3ef13ade result=3ec5a7d0.0f6 errno=0
+func=log1pf op1=3ef13c0d result=3ec5a89e.054 errno=0
+func=log1pf op1=3ef13d71 result=3ec5a990.016 errno=0
+func=log1pf op1=3ef14074 result=3ec5ab9c.12c errno=0
+func=log1pf op1=3ef146a0 result=3ec5afce.035 errno=0
+func=log1pf op1=3ef14a39 result=3ec5b240.024 errno=0
+func=log1pf op1=3ef14d39 result=3ec5b44a.00c errno=0
+func=log1pf op1=3ef152a3 result=3ec5b7f8.04d errno=0
+func=log1pf op1=3ef170a1 result=3ec5cc5a.021 errno=0
+func=log1pf op1=3ef17855 result=3ec5d196.0dc errno=0
+func=log1pf op1=3ef17ece result=3ec5d5fc.010 errno=0
+func=log1pf op1=3ef1810c result=3ec5d782.08e errno=0
+func=log1pf op1=3ef18da9 result=3ec5e014.0ae errno=0
+func=log1pf op1=3ef19054 result=3ec5e1e4.1a2 errno=0
+func=log1pf op1=3ef190ea result=3ec5e24a.048 errno=0
+func=log1pf op1=3ef1a739 result=3ec5f172.0d8 errno=0
+func=log1pf op1=3ef1a83c result=3ec5f222.018 errno=0
+func=log1pf op1=3ef1bbcc result=3ec5ff6c.09d errno=0
+func=log1pf op1=3ef1bd3c result=3ec60066.03a errno=0
+func=log1pf op1=3ef1d6ee result=3ec611da.056 errno=0
+func=log1pf op1=3ef1de36 result=3ec616cc.01b errno=0
+func=log1pf op1=3ef1e623 result=3ec61c2e.008 errno=0
+func=log1pf op1=3ef1e9b1 result=3ec61e98.029 errno=0
+func=log1pf op1=3ef1ee19 result=3ec62196.0d8 errno=0
+func=log1pf op1=3ef1f13a result=3ec623b6.039 errno=0
+func=log1pf op1=3ef1f1a7 result=3ec62400.091 errno=0
+func=log1pf op1=3ef1f214 result=3ec6244a.0e8 errno=0
+func=log1pf op1=3ef206e1 result=3ec6326a.09b errno=0
+func=log1pf op1=3ef21245 result=3ec63a26.012 errno=0
+func=log1pf op1=3ef217fd result=3ec63e08.048 errno=0
+func=log1pf op1=3ef2186a result=3ec63e52.063 errno=0
diff --git a/pl/math/test/testcases/directed/log2.tst b/pl/math/test/testcases/directed/log2.tst
new file mode 100644
index 0000000..5d1eb9b
--- /dev/null
+++ b/pl/math/test/testcases/directed/log2.tst
@@ -0,0 +1,21 @@
+; Directed test cases for log2
+;
+; Copyright (c) 2018-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0
+func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i
+func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0
+func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
+func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
+func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0
+func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0
+func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0
diff --git a/pl/math/test/testcases/directed/log2f.tst b/pl/math/test/testcases/directed/log2f.tst
new file mode 100644
index 0000000..4e08110
--- /dev/null
+++ b/pl/math/test/testcases/directed/log2f.tst
@@ -0,0 +1,27 @@
+; log2f.tst - Directed test cases for log2f
+;
+; Copyright (c) 2017-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=log2f op1=7fc00001 result=7fc00001 errno=0
+func=log2f op1=ffc00001 result=7fc00001 errno=0
+func=log2f op1=7f800001 result=7fc00001 errno=0 status=i
+func=log2f op1=ff800001 result=7fc00001 errno=0 status=i
+func=log2f op1=ff810000 result=7fc00001 errno=0 status=i
+func=log2f op1=7f800000 result=7f800000 errno=0
+func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=log2f op1=3f800000 result=00000000 errno=0
+func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z
+func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z
+func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i
+
+func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0
+func=log2f op1=3f604189 result=be4394c8.395 error=0
+func=log2f op1=3f278034 result=bf1caa73.88e error=0
+func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0
+func=log2f op1=3e61259a result=c00bdb95.650 error=0
+func=log2f op1=3f8147ae result=3c6b3267.d6a error=0
+func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0
+func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0
+func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0
+func=log2f op1=40070838 result=3f89e055.a0a error=0
diff --git a/pl/math/test/testcases/directed/sinh.tst b/pl/math/test/testcases/directed/sinh.tst
new file mode 100644
index 0000000..d6a3da8
--- /dev/null
+++ b/pl/math/test/testcases/directed/sinh.tst
@@ -0,0 +1,21 @@
+; sinh.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=sinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=sinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=sinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=sinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=sinh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=sinh op1=fff00000.00000000 result=fff00000.00000000 errno=0
+func=sinh op1=ffefffff.ffffffff result=fff00000.00000000 errno=ERANGE status=ox
+func=sinh op1=00000000.00000000 result=00000000.00000000 errno=0
+func=sinh op1=80000000.00000000 result=80000000.00000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=sinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=sinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/sinhf.tst b/pl/math/test/testcases/directed/sinhf.tst
new file mode 100644
index 0000000..5f7bd1b
--- /dev/null
+++ b/pl/math/test/testcases/directed/sinhf.tst
@@ -0,0 +1,21 @@
+; sinhf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=sinhf op1=7fc00001 result=7fc00001 errno=0
+func=sinhf op1=ffc00001 result=7fc00001 errno=0
+func=sinhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=sinhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=sinhf op1=7f800000 result=7f800000 errno=0
+func=sinhf op1=7f7fffff result=7f800000 errno=ERANGE status=ox
+func=sinhf op1=ff800000 result=ff800000 errno=0
+func=sinhf op1=ff7fffff result=ff800000 errno=ERANGE status=ox
+func=sinhf op1=00000000 result=00000000 errno=0
+func=sinhf op1=80000000 result=80000000 errno=0
+
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=sinhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=sinhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/tanf.tst b/pl/math/test/testcases/directed/tanf.tst
new file mode 100644
index 0000000..3161f70
--- /dev/null
+++ b/pl/math/test/testcases/directed/tanf.tst
@@ -0,0 +1,25 @@
+; tanf.tst
+;
+; Copyright (c) 2022-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=tanf op1=7fc00001 result=7fc00001 errno=0
+func=tanf op1=ffc00001 result=7fc00001 errno=0
+func=tanf op1=7f800001 result=7fc00001 errno=0 status=i
+func=tanf op1=ff800001 result=7fc00001 errno=0 status=i
+func=tanf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=tanf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=tanf op1=00000000 result=00000000 errno=0
+func=tanf op1=80000000 result=80000000 errno=0
+; SDCOMP-26094: check tanf in the cases for which the range reducer
+; returns values furthest beyond its nominal upper bound of pi/4.
+func=tanf op1=46427f1b result=3f80396d.599 error=0
+func=tanf op1=4647e568 result=3f8039a6.c9f error=0
+func=tanf op1=46428bac result=3f803a03.148 error=0
+func=tanf op1=4647f1f9 result=3f803a3c.852 error=0
+func=tanf op1=4647fe8a result=3f803ad2.410 error=0
+func=tanf op1=45d8d7f1 result=bf800669.901 error=0
+func=tanf op1=45d371a4 result=bf800686.3cd error=0
+func=tanf op1=45ce0b57 result=bf8006a2.e9a error=0
+func=tanf op1=45d35882 result=bf80071b.bc4 error=0
+func=tanf op1=45cdf235 result=bf800738.693 error=0
diff --git a/pl/math/test/testcases/directed/tanh.tst b/pl/math/test/testcases/directed/tanh.tst
new file mode 100644
index 0000000..78776e6
--- /dev/null
+++ b/pl/math/test/testcases/directed/tanh.tst
@@ -0,0 +1,18 @@
+; tanh.tst
+;
+; Copyright (c) 1999-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=tanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=tanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=tanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=tanh op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
+func=tanh op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=tanh op1=00000000.00000000 result=00000000.00000000 errno=0
+func=tanh op1=80000000.00000000 result=80000000.00000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=tanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=tanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/directed/tanhf.tst b/pl/math/test/testcases/directed/tanhf.tst
new file mode 100644
index 0000000..603e310
--- /dev/null
+++ b/pl/math/test/testcases/directed/tanhf.tst
@@ -0,0 +1,18 @@
+; tanhf.tst
+;
+; Copyright (c) 2007-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=tanhf op1=7fc00001 result=7fc00001 errno=0
+func=tanhf op1=ffc00001 result=7fc00001 errno=0
+func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i
+func=tanhf op1=7f800000 result=3f800000 errno=0
+func=tanhf op1=ff800000 result=bf800000 errno=0
+func=tanhf op1=00000000 result=00000000 errno=0
+func=tanhf op1=80000000 result=80000000 errno=0
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux
diff --git a/pl/math/test/testcases/random/double.tst b/pl/math/test/testcases/random/double.tst
new file mode 100644
index 0000000..d83283e
--- /dev/null
+++ b/pl/math/test/testcases/random/double.tst
@@ -0,0 +1,6 @@
+!! double.tst - Random test case specification for DP functions
+!!
+!! Copyright (c) 1999-2023, Arm Limited.
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+test log10 10000
diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst
new file mode 100644
index 0000000..fa77efe
--- /dev/null
+++ b/pl/math/test/testcases/random/float.tst
@@ -0,0 +1,8 @@
+!! float.tst - Random test case specification for SP functions
+!!
+!! Copyright (c) 2022-2023, Arm Limited.
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+test erff 10000
+test log10f 10000
+test tanf 10000
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
new file mode 100644
index 0000000..5e3133e
--- /dev/null
+++ b/pl/math/test/ulp_funcs.h
@@ -0,0 +1,66 @@
+/*
+ * Function entries for ulp.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifdef __vpcs
+
+#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f)
+#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f)
+#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f)
+#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f)
+
+#elif __aarch64__
+
+#define _ZVF1(f) SF1 (f) VF1 (f)
+#define _ZVD1(f) SD1 (f) VD1 (f)
+#define _ZVF2(f) SF2 (f) VF2 (f)
+#define _ZVD2(f) SD2 (f) VD2 (f)
+
+#elif WANT_VMATH
+
+#define _ZVF1(f) SF1 (f)
+#define _ZVD1(f) SD1 (f)
+#define _ZVF2(f) SF2 (f)
+#define _ZVD2(f) SD2 (f)
+
+#else
+
+#define _ZVF1(f)
+#define _ZVD1(f)
+#define _ZVF2(f)
+#define _ZVD2(f)
+
+#endif
+
+#if WANT_SVE_MATH
+
+#define _ZSVF1(f) SVF1 (f) ZSVF1 (f)
+#define _ZSVF2(f) SVF2 (f) ZSVF2 (f)
+#define _ZSVD1(f) SVD1 (f) ZSVD1 (f)
+#define _ZSVD2(f) SVD2 (f) ZSVD2 (f)
+
+#else
+
+#define _ZSVF1(f)
+#define _ZSVF2(f)
+#define _ZSVD1(f)
+#define _ZSVD2(f)
+
+#endif
+
+#define _ZSF1(f) F1 (f)
+#define _ZSF2(f) F2 (f)
+#define _ZSD1(f) D1 (f)
+#define _ZSD2(f) D2 (f)
+
+#include "ulp_funcs_gen.h"
+
+#if WANT_SVE_MATH
+F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0)
+F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0)
+F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0)
+F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0)
+#endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
new file mode 100644
index 0000000..b682e93
--- /dev/null
+++ b/pl/math/test/ulp_wrappers.h
@@ -0,0 +1,178 @@
+// clang-format off
+/*
+ * Function wrappers for ulp.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include <stdbool.h>
+
+#if USE_MPFR
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_cos(y, x, r);
+  return mpfr_sin(y, x, r);
+}
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
+  mpfr_sin(y, x, r);
+  return mpfr_cos(y, x, r);
+}
+static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) {
+  mpfr_t y2;
+  mpfr_init(y2);
+  mpfr_trunc(y2, y);
+  return mpfr_pow(ret, x, y2, rnd);
+}
+#endif
+
+/* Our implementations of powi/powk are too imprecise to verify
+   against any established pow implementation. Instead we have the
+   following simple implementation, against which it is enough to
+   maintain bitwise reproducibility. Note the test framework expects
+   the reference impl to be of higher precision than the function
+   under test. For instance this means that the reference for
+   double-precision powi will be passed a long double, so to check
+   bitwise reproducibility we have to cast it back down to
+   double. This is fine since a round-trip to higher precision and
+   back down is correctly rounded.  */
+#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T)                            \
+  static DBL_T NAME (DBL_T in_val, DBL_T y)                                    \
+  {                                                                            \
+    INT_T n = (INT_T) round (y);                                               \
+    FLT_T acc = 1.0;                                                           \
+    bool want_recip = n < 0;                                                   \
+    n = n < 0 ? -n : n;                                                        \
+                                                                               \
+    for (FLT_T c = in_val; n; c *= c, n >>= 1)                                 \
+      {                                                                        \
+        if (n & 0x1)                                                           \
+          {                                                                    \
+            acc *= c;                                                          \
+          }                                                                    \
+      }                                                                        \
+    if (want_recip)                                                            \
+      {                                                                        \
+        acc = 1.0 / acc;                                                       \
+      }                                                                        \
+    return acc;                                                                \
+  }
+
+DECL_POW_INT_REF(ref_powif, double, float, int)
+DECL_POW_INT_REF(ref_powi, long double, double, int)
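+
+/* For readability: DECL_POW_INT_REF (ref_powif, double, float, int) above
+   expands to the square-and-multiply reference below, reproduced as a
+   comment purely for illustration (the macro is the definition):
+
+     static double ref_powif (double in_val, double y)
+     {
+       int n = (int) round (y);
+       float acc = 1.0;
+       bool want_recip = n < 0;
+       n = n < 0 ? -n : n;
+       for (float c = in_val; n; c *= c, n >>= 1)
+         if (n & 0x1)
+           acc *= c;         // multiply in this set bit of the exponent
+       if (want_recip)
+         acc = 1.0 / acc;    // negative exponent: reciprocal at the end
+       return acc;           // float widens back to double on return
+     }  */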
+
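+/* The wrappers below broadcast their scalar argument to every vector lane
+   (argf/argd are helpers provided by the ulp harness), call the vector
+   routine, and return lane 0, letting ulp exercise vector implementations
+   through its scalar test interface.  */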
+#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; }
+#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; }
+#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; }
+#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; }
+
+#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; }
+#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; }
+#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; }
+#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; }
+
+#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; }
+#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; }
+#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; }
+#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; }
+
+#ifdef __vpcs
+
+#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func)
+#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func)
+#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func)
+#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func)
+
+#elif __aarch64__
+
+#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func)
+#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func)
+#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func)
+#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func)
+
+#elif WANT_VMATH
+
+#define ZVNF1_WRAP(func) VF1_WRAP(func)
+#define ZVNF2_WRAP(func) VF2_WRAP(func)
+#define ZVND1_WRAP(func) VD1_WRAP(func)
+#define ZVND2_WRAP(func) VD2_WRAP(func)
+
+#else
+
+#define ZVNF1_WRAP(func)
+#define ZVNF2_WRAP(func)
+#define ZVND1_WRAP(func)
+#define ZVND2_WRAP(func)
+
+#endif
+
+#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); }
+#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); }
+#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); }
+#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); }
+
+#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); }
+#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); }
+#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); }
+#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); }
+
+#if WANT_SVE_MATH
+
+#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func)
+#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func)
+#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func)
+#define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func)
+
+#else
+
+#define ZSVNF1_WRAP(func)
+#define ZSVNF2_WRAP(func)
+#define ZSVND1_WRAP(func)
+#define ZSVND2_WRAP(func)
+
+#endif
+
+/* No wrappers for scalar routines, but PL_SIG will emit them.  */
+#define ZSNF1_WRAP(func)
+#define ZSNF2_WRAP(func)
+#define ZSND1_WRAP(func)
+#define ZSND2_WRAP(func)
+
+#include "ulp_wrappers_gen.h"
+
+#if WANT_SVE_MATH
+static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); }
+static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); }
+static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); }
+static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); }
+#endif
+// clang-format on
diff --git a/pl/math/tools/asinh.sollya b/pl/math/tools/asinh.sollya
new file mode 100644
index 0000000..663ee92
--- /dev/null
+++ b/pl/math/tools/asinh.sollya
@@ -0,0 +1,28 @@
+// polynomial for approximating asinh(x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// The polynomial is used in [2^-26, 1]. It is least accurate close to 1, so we
+// use 2^-6 as the lower bound for coefficient generation; the resulting
+// polynomial remains sufficiently accurate on the rest of the interval,
+// [2^-26, 2^-6].
+deg = 17; // poly degree; assumed value - deg is used below but was never set
+
+a = 0x1p-6;
+b = 1.0;
+
+f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2);
+
+approx = proc(poly, d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+poly = 0;
+for i from 0 to deg do {
+  i;
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+
+display = hexadecimal;
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/asinhf.sollya b/pl/math/tools/asinhf.sollya
new file mode 100644
index 0000000..ab115b5
--- /dev/null
+++ b/pl/math/tools/asinhf.sollya
@@ -0,0 +1,29 @@
+// polynomial for approximating asinh(x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 9;
+
+a = 0x1.0p-12;
+b = 1.0;
+
+f = proc(y) {
+  return asinh(y); // use the parameter, not the global free variable x
+};
+
+approx = proc(poly, d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+poly = x;
+for i from 2 to deg do {
+  p = roundcoefficients(approx(poly,i), [|SG ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 2 to deg do coeff(poly,i);
diff --git a/pl/math/tools/atan.sollya b/pl/math/tools/atan.sollya
new file mode 100644
index 0000000..ad4f33b
--- /dev/null
+++ b/pl/math/tools/atan.sollya
@@ -0,0 +1,23 @@
+// polynomial for approximating atan(x) and atan2(y, x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// atan is odd, so approximate with an odd polynomial:
+// x + ax^3 + bx^5 + cx^7 + ...
+// We generate a, b, c, ... such that we can approximate atan(x) by:
+// x + x^3 * (a + bx^2 + cx^4 + ...)
+
+// Assemble monomials
+deg = 20;
+mons = [|1,...,deg|];
+for i from 0 to deg-1 do mons[i] = mons[i] * 2 + 1;
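+// after the loop, mons = [|3, 5, ..., 2*deg+1|]: the odd powers above 1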
+
+a = 0x1.0p-1022;
+b = 1;
+
+poly = fpminimax(atan(x)-x, mons, [|double ...|], [a;b]);
+
+display = hexadecimal;
+print("coeffs:");
+for i from 0 to deg-1 do coeff(poly,mons[i]);
diff --git a/pl/math/tools/atanf.sollya b/pl/math/tools/atanf.sollya
new file mode 100644
index 0000000..ed88d0b
--- /dev/null
+++ b/pl/math/tools/atanf.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating atanf(x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// Generate list of monomials:
+// Taylor series of atan is of the form x + ax^3 + bx^5 + cx^7 + ...
+// So generate a, b, c, ... such that we can approximate atan(x) by:
+// x + x^3 * (a + bx^2 + cx^4 + ...)
+
+deg = 7;
+
+a = 1.1754943508222875e-38;
+b = 1;
+
+poly = fpminimax((atan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/cbrt.sollya b/pl/math/tools/cbrt.sollya
new file mode 100644
index 0000000..1d43dc7
--- /dev/null
+++ b/pl/math/tools/cbrt.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in double precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|double ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), D, RN);
diff --git a/pl/math/tools/cbrtf.sollya b/pl/math/tools/cbrtf.sollya
new file mode 100644
index 0000000..4e0cc69
--- /dev/null
+++ b/pl/math/tools/cbrtf.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating cbrt(x) in single precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 3;
+
+a = 0.5;
+b = 1;
+
+
+f = x^(1/3);
+
+poly = fpminimax(f, deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), SG, RN);
diff --git a/pl/math/tools/erfc.sollya b/pl/math/tools/erfc.sollya
new file mode 100644
index 0000000..8c40b4b
--- /dev/null
+++ b/pl/math/tools/erfc.sollya
@@ -0,0 +1,23 @@
+// polynomial for approximating erfc(x)*exp(x*x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 12; // poly degree
+
+// interval bounds
+a = 0x1.60dfc14636e2ap0;
+b = 0x1.d413cccfe779ap0;
+
+f = proc(y) {
+  t = y + a;
+  return erfc(t) * exp(t*t);
+};
+
+poly = remez(f(x), deg, [0;b-a], 1, 1e-16);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do round(coeff(poly,i), 52, RN);
diff --git a/pl/math/tools/erfcf.sollya b/pl/math/tools/erfcf.sollya
new file mode 100644
index 0000000..69c6836
--- /dev/null
+++ b/pl/math/tools/erfcf.sollya
@@ -0,0 +1,31 @@
+// polynomial for approximating erfc(x)*exp(x*x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 15; // poly degree
+
+// interval bounds
+a = 0x1.0p-26;
+b = 2;
+
+f = proc(y) {
+  return erfc(y) * exp(y*y);
+};
+
+approx = proc(poly, d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+poly = 0;
+for i from 0 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+  print(i);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/expm1.sollya b/pl/math/tools/expm1.sollya
new file mode 100644
index 0000000..7b6f324
--- /dev/null
+++ b/pl/math/tools/expm1.sollya
@@ -0,0 +1,21 @@
+// polynomial for approximating exp(x)-1 in double precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 12;
+
+a = -log(2)/2;
+b = log(2)/2;
+
+f = proc(y) {
+  return exp(y)-1;
+};
+
+poly = fpminimax(f(x), deg, [|double ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 2 to deg do round(coeff(poly,i), D, RN);
diff --git a/pl/math/tools/expm1f.sollya b/pl/math/tools/expm1f.sollya
new file mode 100644
index 0000000..efdf1bd
--- /dev/null
+++ b/pl/math/tools/expm1f.sollya
@@ -0,0 +1,21 @@
+// polynomial for approximating exp(x)-1 in single precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 5;
+
+a = -log(2)/2;
+b = log(2)/2;
+
+f = proc(y) {
+  return exp(y)-1;
+};
+
+poly = fpminimax(f(x), deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 2 to deg do round(coeff(poly,i), SG, RN);
diff --git a/pl/math/tools/log10.sollya b/pl/math/tools/log10.sollya
new file mode 100644
index 0000000..85d1d15
--- /dev/null
+++ b/pl/math/tools/log10.sollya
@@ -0,0 +1,44 @@
+// polynomial for approximating log10(1+x)
+//
+// Copyright (c) 2019-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 6; // poly degree
+// |log10(1+x)| > 0x1p-5 outside the interval
+a = -0x1.p-5;
+b = 0x1.p-5;
+
+ln10 = evaluate(log(10),0);
+invln10hi = double(1/ln10 + 0x1p21) - 0x1p21; // round away last 21 bits
+invln10lo = double(1/ln10 - invln10hi);
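+// (adding then subtracting 0x1p21 forces the sum to round at the 2^-31 bit,
+// zeroing the low bits of invln10hi's significand; invln10lo captures the
+// rounded-off part, so invln10hi + invln10lo represents 1/ln10 to well beyond
+// double precision)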
+
+// find log10(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log10(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+f = f/ln10;
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
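+// (the weight x^d/f(x) makes remez minimise
+// |(1 - poly(x)/f(x)) - p(x)*x^d/f(x)| = |f(x) - poly(x) - x^d*p(x)|/|f(x)|,
+// i.e. the relative error of the updated polynomial poly + x^d*p)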
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = invln10hi + invln10lo;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+display = hexadecimal;
+print("invln10hi:", invln10hi);
+print("invln10lo:", invln10lo);
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
+
+display = decimal;
+print("in [",a,b,"]");
diff --git a/pl/math/tools/log10f.sollya b/pl/math/tools/log10f.sollya
new file mode 100644
index 0000000..94bf32f
--- /dev/null
+++ b/pl/math/tools/log10f.sollya
@@ -0,0 +1,37 @@
+// polynomial for approximating log10f(1+x)
+//
+// Copyright (c) 2019-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// Computation of log10f(1+x) will be carried out in double precision
+
+deg = 4; // poly degree
+// [OFF; 2*OFF] is divided into 2^4 intervals, with OFF ~ 0.7
+a = -0.04375;
+b = 0.04375;
+
+// find log(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = 1;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do double(coeff(poly,i));
diff --git a/pl/math/tools/log1p.sollya b/pl/math/tools/log1p.sollya
new file mode 100644
index 0000000..598a36a
--- /dev/null
+++ b/pl/math/tools/log1p.sollya
@@ -0,0 +1,30 @@
+// polynomial for approximating log(1+x) in double precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 20;
+
+a = sqrt(2)/2-1;
+b = sqrt(2)-1;
+
+f = proc(y) {
+  return log(1+y);
+};
+
+approx = proc(poly, d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+poly = x;
+for i from 2 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+
+print("coeffs:");
+display = hexadecimal;
+for i from 2 to deg do coeff(poly,i);
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
diff --git a/pl/math/tools/log1pf.sollya b/pl/math/tools/log1pf.sollya
new file mode 100644
index 0000000..cc1db10
--- /dev/null
+++ b/pl/math/tools/log1pf.sollya
@@ -0,0 +1,21 @@
+// polynomial for approximating log(1+x) in single precision
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 10;
+
+a = -0.25;
+b = 0.5;
+
+f = proc(y) {
+  return log(1+y);
+};
+
+poly = fpminimax(f(x), deg, [|single ...|], [a;b]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 2 to deg do round(coeff(poly,i), SG, RN);
diff --git a/pl/math/tools/tan.sollya b/pl/math/tools/tan.sollya
new file mode 100644
index 0000000..bb0bb28
--- /dev/null
+++ b/pl/math/tools/tan.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating double precision tan(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 8;
+
+// interval bounds
+a = 0x1.0p-126;
+b = pi / 8;
+
+display = hexadecimal;
+
+f = (tan(sqrt(x))-sqrt(x))/x^(3/2);
+poly = fpminimax(f, deg, [|double ...|], [a*a;b*b]);
+
+//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/tanf.sollya b/pl/math/tools/tanf.sollya
new file mode 100644
index 0000000..f4b49b4
--- /dev/null
+++ b/pl/math/tools/tanf.sollya
@@ -0,0 +1,78 @@
+// polynomial for approximating single precision tan(x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+dtype = single;
+
+mthd = 0; // approximate tan
+deg = 5; // poly degree
+
+// // Uncomment for cotan
+// mthd = 1; // approximate cotan
+// deg = 3; // poly degree
+
+// interval bounds
+a = 0x1.0p-126;
+b = pi / 4;
+
+print("Print some useful constants");
+display = hexadecimal!;
+if (dtype==double) then { prec = 53!; }
+else if (dtype==single) then { prec = 23!; };
+
+print("pi/4");
+pi/4;
+
+// Setup precisions (display and computation)
+display = decimal!;
+prec=128!;
+save_prec=prec;
+
+//
+// Select function to approximate with Sollya
+//
+if(mthd==0) then {
+  s = "x + x^3 * P(x^2)";
+  g = tan(x);
+  F = proc(P) { return x + x^3 * P(x^2); };
+  f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x));
+  init_poly = 0;
+  // Display info
+  print("Approximate g(x) =", g, "as F(x)=", s, ".");
+  poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]);
+}
+else if (mthd==1) then {
+  s = "1/x + x * P(x^2)";
+  g = 1 / tan(x);
+  F = proc(P) { return 1/x + x * P(x^2); };
+  f = (g(sqrt(x))-1/sqrt(x))/(sqrt(x));
+  init_poly = 0;
+  deg_init_poly = -1; // chosen so that we start by building the constant coefficient
+  // Display info
+  print("Approximate g(x) =", g, "as F(x)=", s, ".");
+  // Fpminimax used to minimise absolute error
+  approx_fpminimax = proc(func, poly, d) {
+    return fpminimax(func - poly / x^-(deg-d), 0, [|dtype|], [a;b], absolute, floating);
+  };
+  // Optimise all coefficients at once
+  poly = fpminimax(f, [|0,...,deg|], [|dtype ...|], [a;b], absolute, floating);
+};
+
+
+//
+// Display coefficients in Sollya
+//
+display = hexadecimal!;
+if (dtype==double) then { prec = 53!; }
+else if (dtype==single) then { prec = 23!; };
+print("_coeffs :_ hex");
+for i from 0 to deg do coeff(poly, i);
+
+// Compute errors
+display = hexadecimal!;
+d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]);
+d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]);
+print("dirty rel error:", d_rel_err);
+print("dirty abs error:", d_abs_err);
+print("in [",a,b,"]");
diff --git a/pl/math/tools/v_erf.sollya b/pl/math/tools/v_erf.sollya
new file mode 100644
index 0000000..394ba37
--- /dev/null
+++ b/pl/math/tools/v_erf.sollya
@@ -0,0 +1,20 @@
+// polynomial for approximating erf(x).
+// To generate coefficients for interval i (0 to 47) do:
+// $ sollya v_erf.sollya $i
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+scale = 1/8;
+deg = 9;
+
+itv = parse(__argv[0]);
+if (itv == 0)  then { a = 0x1p-1022; }
+else                { a = itv * scale; };
+
+prec=256;
+
+poly = fpminimax(erf(scale*x+a), deg, [|D ...|], [0; 1]);
+
+display = hexadecimal;
+for i from 0 to deg do coeff(poly, i);
\ No newline at end of file
diff --git a/pl/math/tools/v_erfc.sollya b/pl/math/tools/v_erfc.sollya
new file mode 100644
index 0000000..3b03ba0
--- /dev/null
+++ b/pl/math/tools/v_erfc.sollya
@@ -0,0 +1,46 @@
+// polynomial for approximating erfc(x)*exp(x*x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
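+//
+// To generate coefficients for interval i (0 to 19, per the bounds list
+// below) do:
+// $ sollya v_erfc.sollya $i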
+
+deg = 12; // poly degree
+
+itv = parse(__argv[0]);
+
+bounds = [|3.725290298461914e-9,
+           0.18920711500272103,
+           0.41421356237309515,
+           0.681792830507429,
+           1,
+           1.378414230005442,
+           1.8284271247461903,
+           2.363585661014858,
+           3,
+           3.756828460010884,
+           4.656854249492381,
+           5.727171322029716,
+           7,
+           8.513656920021768,
+           10.313708498984761,
+           12.454342644059432,
+           15,
+           18.027313840043536,
+           21.627416997969522,
+           25.908685288118864,
+           31|];
+
+a = bounds[itv];
+b = bounds[itv + 1];
+
+f = proc(y) {
+  t = y + a;
+  return erfc(t) * exp(t*t);
+};
+
+poly = fpminimax(f(x), deg, [|double ...|], [0;b-a]);
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly, i);
diff --git a/pl/math/tools/v_log10.sollya b/pl/math/tools/v_log10.sollya
new file mode 100644
index 0000000..e2df436
--- /dev/null
+++ b/pl/math/tools/v_log10.sollya
@@ -0,0 +1,38 @@
+// polynomial used for __v_log10(x)
+//
+// Copyright (c) 2019-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 6; // poly degree
+a = -0x1.fc1p-9;
+b = 0x1.009p-8;
+
+// find log(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = 1;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|D ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+// scale coefficients by 1/ln(10)
+ln10 = evaluate(log(10),0);
+poly = poly/ln10;
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do double(coeff(poly,i));
diff --git a/pl/math/tools/v_log10f.sollya b/pl/math/tools/v_log10f.sollya
new file mode 100644
index 0000000..396d5a9
--- /dev/null
+++ b/pl/math/tools/v_log10f.sollya
@@ -0,0 +1,45 @@
+// polynomial for approximating v_log10f(1+x)
+//
+// Copyright (c) 2019-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 9; // poly degree
+// |log10(1+x)| > 0x1p-4 outside the interval
+a = -1/3;
+b =  1/3;
+
+display = hexadecimal;
+print("log10(2) = ", single(log10(2)));
+
+ln10 = evaluate(log(10),0);
+invln10 = single(1/ln10);
+
+// find log10(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log10(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+f = f/ln10;
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = invln10;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|SG ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+display = hexadecimal;
+print("invln10:", invln10);
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do single(coeff(poly,i));
+
+display = decimal;
+print("in [",a,b,"]");
diff --git a/pl/math/tools/v_log2f.sollya b/pl/math/tools/v_log2f.sollya
new file mode 100644
index 0000000..99e050c
--- /dev/null
+++ b/pl/math/tools/v_log2f.sollya
@@ -0,0 +1,38 @@
+// polynomial used for __v_log2f(x)
+//
+// Copyright (c) 2022-2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 9; // poly degree
+a = -1/3;
+b = 1/3;
+
+ln2 = evaluate(log(2),0);
+invln2 = single(1/ln2);
+
+// find log2(1+x)/x polynomial with minimal relative error
+// (minimal relative error polynomial for log2(1+x) is the same * x)
+deg = deg-1; // because of /x
+
+// f = log2(1+x)/x; using taylor series
+f = 0;
+for i from 0 to 60 do { f = f + (-x)^i/(i+1); };
+f = f * invln2;
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+  return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = invln2;
+for i from 1 to deg do {
+  p = roundcoefficients(approx(poly,i), [|SG ...|]);
+  poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/v_acosh_3u5.c b/pl/math/v_acosh_3u5.c
new file mode 100644
index 0000000..22f69d7
--- /dev/null
+++ b/pl/math/v_acosh_3u5.c
@@ -0,0 +1,51 @@
+/*
+ * Single-precision vector acosh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+#define OneTop 0x3ff /* top12 (asuint64 (1.0)); assumed - used below but not defined in this file.  */
+#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)).  */
+
+#if V_SUPPORTED
+
+static NOINLINE VPCS_ATTR v_f64_t
+special_case (v_f64_t x)
+{
+  return v_call_f64 (acosh, x, x, v_u64 (-1));
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+   The largest observed error is 3.02 ULP in the region where the
+   argument to log1p falls in the k=0 interval, i.e. x close to 1:
+   __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+				  want 0x1.f2d6d823bc9e2p-5.  */
+VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x)
+{
+  v_u64_t itop = v_as_u64_f64 (x) >> 52;
+  v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop));
+
+  /* Fall back to scalar routine for all lanes if any of them are special.  */
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x);
+
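+  /* For x >= 1, acosh(x) = log (x + sqrt (x^2 - 1))
+		= log1p ((x - 1) + sqrt ((x - 1) * (x + 1))).  */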
+  v_f64_t xm1 = x - 1;
+  v_f64_t u = xm1 * (x + 1);
+  return log1p_inline (xm1 + v_sqrt_f64 (u));
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (V_NAME (acosh), 2.53)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh))
+PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000)
+PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000)
+PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000)
+PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000)
+#endif
diff --git a/pl/math/v_acoshf_3u1.c b/pl/math/v_acoshf_3u1.c
new file mode 100644
index 0000000..2b5aff5
--- /dev/null
+++ b/pl/math/v_acoshf_3u1.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector acosh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define SignMask 0x80000000
+#define One 0x3f800000
+#define SquareLim 0x5f800000 /* asuint(0x1p64).  */
+
+#if V_SUPPORTED
+
+#include "v_log1pf_inline.h"
+
+static NOINLINE VPCS_ATTR v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (acoshf, x, y, special);
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+   error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+   is 2.78 ULP:
+   __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
+			   want 0x1.ef9ea2p-3.
+   With exceptions disabled, we can compute u with a shorter dependency chain,
+   which gives maximum error of 3.07 ULP:
+  __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
+			   want 0x1.fbc7f4p-4.  */
+
+VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One));
+
+#if WANT_SIMD_EXCEPT
+  /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+     only xm1 to calculate u, as operating on x will trigger invalid for NaN. */
+  v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1);
+  v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1);
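+  /* Here u = xm1 * (xm1 + 2) = x^2 - 1, so x itself is never read on special
+     (NaN) lanes.  */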
+#else
+  v_f32_t xm1 = x - 1;
+  v_f32_t u = xm1 * (x + 1.0f);
+#endif
+  v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u));
+
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, acosh, 1.0, 10.0)
+#if WANT_SIMD_EXCEPT
+PL_TEST_ULP (V_NAME (acoshf), 2.29)
+#else
+PL_TEST_ULP (V_NAME (acoshf), 2.58)
+#endif
+PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500)
+PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000)
+PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000)
+#endif
diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c
new file mode 100644
index 0000000..fd329b6
--- /dev/null
+++ b/pl/math/v_asinh_3u5.c
@@ -0,0 +1,175 @@
+/*
+ * Double-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define OneTop 0x3ff	/* top12(asuint64(1.0)).  */
+#define HugeBound 0x5fe /* top12(asuint64(0x1p511)).  */
+#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)).  */
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define C(i) v_f64 (__asinh_data.poly[i])
+
+/* Constants & data for log.  */
+#define OFF 0x3fe6000000000000
+#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
+#define A(i) v_f64 (__sv_log_data.poly[i])
+#define T(i) __log_data.tab[i]
+#define N (1 << LOG_TABLE_BITS)
+
+static NOINLINE v_f64_t
+special_case (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (asinh, x, y, special);
+}
+
+struct entry
+{
+  v_f64_t invc;
+  v_f64_t logc;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  e.invc = T (i).invc;
+  e.logc = T (i).logc;
+#else
+  e.invc[0] = T (i[0]).invc;
+  e.logc[0] = T (i[0]).logc;
+  e.invc[1] = T (i[1]).invc;
+  e.logc[1] = T (i[1]).logc;
+#endif
+  return e;
+}
+
+static inline v_f64_t
+log_inline (v_f64_t x)
+{
+  /* Double-precision vector log, copied from math/v_log.c with some cosmetic
+     modification and special-cases removed. See that file for details of the
+     algorithm used.  */
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t tmp = ix - OFF;
+  v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N;
+  v_s64_t k = v_as_s64_u64 (tmp) >> 52;
+  v_u64_t iz = ix - (tmp & 0xfffULL << 52);
+  v_f64_t z = v_as_f64_u64 (iz);
+  struct entry e = lookup (i);
+  v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
+  v_f64_t kd = v_to_f64_s64 (k);
+  v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r);
+  v_f64_t r2 = r * r;
+  v_f64_t y = v_fma_f64 (A (3), r, A (2));
+  v_f64_t p = v_fma_f64 (A (1), r, A (0));
+  y = v_fma_f64 (A (4), r2, y);
+  y = v_fma_f64 (y, r2, p);
+  y = v_fma_f64 (y, r2, hi);
+  return y;
+}
+
+/* Double-precision implementation of vector asinh(x).
+   asinh is very sensitive around 1, so it is impractical to devise a single
+   low-cost algorithm which is sufficiently accurate on a wide range of input.
+   Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1))    if |x| >= 1
+	    = sign(x) * (|x| + |x|^3 * P(x^2))       otherwise
+   where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine. The greatest observed error is 3.29 ULP,
+   in |x| >= 1:
+   __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
+				  want 0x1.ffffcfd0e2352p-1.  */
+VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_f64_t ax = v_as_f64_u64 (iax);
+  v_u64_t top12 = iax >> 52;
+
+  v_u64_t gt1 = v_cond_u64 (top12 >= OneTop);
+  v_u64_t special = v_cond_u64 (top12 >= HugeBound);
+
+#if WANT_SIMD_EXCEPT
+  v_u64_t tiny = v_cond_u64 (top12 < TinyBound);
+  special |= tiny;
+#endif
+
+  /* Option 1: |x| >= 1.
+     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+     If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+     overflow, by setting special lanes to 1. These will be fixed later.  */
+  v_f64_t option_1 = v_f64 (0);
+  if (likely (v_any_u64 (gt1)))
+    {
+#if WANT_SIMD_EXCEPT
+      v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax);
+#else
+      v_f64_t xm = ax;
+#endif
+      option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1));
+    }
+
+  /* Option 2: |x| < 1.
+     Compute asinh(x) using a polynomial.
+     If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+     overflow, and tiny lanes, which will underflow, by setting them to 0. They
+     will be fixed later, either by selecting x or falling back to the scalar
+     special-case. The largest observed error in this region is 1.47 ULPs:
+     __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+				    want 0x1.c1d6bf874019cp-1.  */
+  v_f64_t option_2 = v_f64 (0);
+  if (likely (v_any_u64 (~gt1)))
+    {
+#if WANT_SIMD_EXCEPT
+      ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
+#endif
+      v_f64_t x2 = ax * ax;
+      v_f64_t z2 = x2 * x2;
+      v_f64_t z4 = z2 * z2;
+      v_f64_t z8 = z4 * z4;
+      v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
+      option_2 = v_fma_f64 (p, x2 * ax, ax);
+#if WANT_SIMD_EXCEPT
+      option_2 = v_sel_f64 (tiny, x, option_2);
+#endif
+    }
+
+  /* Choose the right option for each lane.  */
+  v_f64_t y = v_sel_f64 (gt1, option_1, option_2);
+  /* Copy sign.  */
+  y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix));
+
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (asinh), 2.80)
+PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+   Ensures the v_sel is choosing the right option in all cases.  */
+#define V_ASINH_INTERVAL(lo, hi, n)                                            \
+  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5)                          \
+  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2)                            \
+  PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600)
+V_ASINH_INTERVAL (0, 0x1p-26, 50000)
+V_ASINH_INTERVAL (0x1p-26, 1, 50000)
+V_ASINH_INTERVAL (1, 0x1p511, 50000)
+V_ASINH_INTERVAL (0x1p511, inf, 40000)
+V_ASINH_INTERVAL (-0, -0x1p-26, 50000)
+V_ASINH_INTERVAL (-0x1p-26, -1, 50000)
+V_ASINH_INTERVAL (-1, -0x1p511, 50000)
+V_ASINH_INTERVAL (-0x1p511, -inf, 40000)
+#endif
diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c
new file mode 100644
index 0000000..9d8c8a9
--- /dev/null
+++ b/pl/math/v_asinhf_2u7.c
@@ -0,0 +1,70 @@
+/*
+ * Single-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define SignMask v_u32 (0x80000000)
+#define One v_f32 (1.0f)
+#define BigBound v_u32 (0x5f800000)  /* asuint(0x1p64).  */
+#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30).  */
+
+#include "v_log1pf_inline.h"
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+   Worst-case error is 2.66 ULP, at roughly +/-0.25:
+   __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3.  */
+VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & ~SignMask;
+  v_u32_t sign = ix & SignMask;
+  v_f32_t ax = v_as_f32_u32 (iax);
+  v_u32_t special = v_cond_u32 (iax >= BigBound);
+
+#if WANT_SIMD_EXCEPT
+  /* Sidestep tiny and large values to avoid inadvertently triggering
+     under/overflow.  */
+  special |= v_cond_u32 (iax < TinyBound);
+  if (unlikely (v_any_u32 (special)))
+    ax = v_sel_f32 (special, One, ax);
+#endif
+
+  /* asinh(x) = log(x + sqrt(x * x + 1)).
+     For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))).  */
+  v_f32_t d = One + v_sqrt_f32 (ax * ax + One);
+  v_f32_t y = log1pf_inline (ax + ax * ax / d);
+  y = v_as_f32_u32 (sign | v_as_u32_f32 (y));
+
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (asinhf), 2.17)
+PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000)
+PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000)
+PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000)
+PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000)
+PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000)
+PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000)
+PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000)
+#endif
diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c
new file mode 100644
index 0000000..6327fea
--- /dev/null
+++ b/pl/math/v_atan2_3u.c
@@ -0,0 +1,90 @@
+/*
+ * Double-precision vector atan2(y, x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "atan_common.h"
+
+#define PiOver2 v_f64 (0x1.921fb54442d18p+0)
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp)
+{
+  return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan.  */
+static inline v_u64_t
+zeroinfnan (v_u64_t i)
+{
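+  /* 2 * i drops the sign bit; the - 1 wraps +/-0 around to UINT64_MAX, so
+     zero, inf and NaN all compare >= 2 * asuint64 (INFINITY) - 1, while every
+     finite non-zero input stays below it.  */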
+  return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1));
+}
+
+/* Fast implementation of vector atan2.
+   Maximum observed error is 2.8 ulps:
+   v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+	got 0x1.92d628ab678ccp-1
+       want 0x1.92d628ab678cfp-1.  */
+VPCS_ATTR
+v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iy = v_as_u64_f64 (y);
+
+  v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+
+  v_u64_t sign_x = ix & SignMask;
+  v_u64_t sign_y = iy & SignMask;
+  v_u64_t sign_xy = sign_x ^ sign_y;
+
+  v_f64_t ax = v_abs_f64 (x);
+  v_f64_t ay = v_abs_f64 (y);
+
+  v_u64_t pred_xlt0 = x < 0.0;
+  v_u64_t pred_aygtax = ay > ax;
+
+  /* Set up z for call to atan.  */
+  v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay);
+  v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax);
+  v_f64_t z = v_div_f64 (n, d);
+
+  /* Work out the correct shift.  */
+  v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0));
+  shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift);
+  shift *= PiOver2;
+
+  v_f64_t ret = eval_poly (z, z, shift);
+
+  /* Account for the sign of x and y.  */
+  ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy);
+
+  if (unlikely (v_any_u64 (special_cases)))
+    {
+      return specialcase (y, x, ret, special_cases);
+    }
+
+  return ret;
+}
+VPCS_ALIAS
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (V, D, 2, atan2)
+// TODO tighten this once __v_atan2 is fixed
+PL_TEST_ULP (V_NAME (atan2), 2.9)
+PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000)
+#endif
diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
new file mode 100644
index 0000000..5d1e6ca
--- /dev/null
+++ b/pl/math/v_atan2f_3u.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector atan2(y, x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+/* Useful constants.  */
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define SignMask v_u32 (0x80000000)
+
+/* Special cases i.e. 0, infinity and nan (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp)
+{
+  return v_call2_f32 (atan2f, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan.  */
+static inline v_u32_t
+zeroinfnan (v_u32_t i)
+{
+  return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1));
+}
+
+/* Fast implementation of vector atan2f. Maximum observed error is
+   2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+   v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+				       want 0x1.967f00p-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iy = v_as_u32_f32 (y);
+
+  v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+
+  v_u32_t sign_x = ix & SignMask;
+  v_u32_t sign_y = iy & SignMask;
+  v_u32_t sign_xy = sign_x ^ sign_y;
+
+  v_f32_t ax = v_abs_f32 (x);
+  v_f32_t ay = v_abs_f32 (y);
+
+  v_u32_t pred_xlt0 = x < 0.0f;
+  v_u32_t pred_aygtax = ay > ax;
+
+  /* Set up z for call to atanf.  */
+  v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay);
+  v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax);
+  v_f32_t z = v_div_f32 (n, d);
+
+  /* Work out the correct shift.  */
+  v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f));
+  shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift);
+  shift *= PiOver2;
+
+  v_f32_t ret = eval_poly (z, z, shift);
+
+  /* Account for the sign of x and y.  */
+  ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    {
+      return specialcase (y, x, ret, special_cases);
+    }
+
+  return ret;
+}
+VPCS_ALIAS
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h.  */
+PL_SIG (V, F, 2, atan2)
+PL_TEST_ULP (V_NAME (atan2f), 2.46)
+PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000)
+#endif
diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c
new file mode 100644
index 0000000..0f3c2cc
--- /dev/null
+++ b/pl/math/v_atan_2u5.c
@@ -0,0 +1,74 @@
+/*
+ * Double-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "atan_common.h"
+
+#define PiOver2 v_f64 (0x1.921fb54442d18p+0)
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
+
+/* Fast implementation of vector atan.
+   Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=-1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+   __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+				 want 0x1.9225645bdd7c3p-1.  */
+VPCS_ATTR
+v_f64_t V_NAME (atan) (v_f64_t x)
+{
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t sign = ix & ~AbsMask;
+
+#if WANT_SIMD_EXCEPT
+  v_u64_t ia12 = (ix >> 52) & 0x7ff;
+  v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound);
+  /* If any lane is special, fall back to the scalar routine for all lanes.  */
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
+  /* Argument reduction:
+     y := arctan(x) for x < 1
+     y := pi/2 + arctan(-1/x) for x > 1
+     Hence, use z=-1/a if x>=1, otherwise z=a.  */
+  v_u64_t red = v_cagt_f64 (x, v_f64 (1.0));
+  /* Avoid dependency in abs(x) in division (and comparison).  */
+  v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x);
+  v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0));
+  /* Use absolute value only when needed (odd powers of z).  */
+  v_f64_t az = v_abs_f64 (z);
+  az = v_sel_f64 (red, -az, az);
+
+  /* Calculate the polynomial approximation.  */
+  v_f64_t y = eval_poly (z, az, shift);
+
+  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
+  y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (atan), 1.78)
+PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000)
+
+#endif
diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c
new file mode 100644
index 0000000..67d90b9
--- /dev/null
+++ b/pl/math/v_atanf_3u.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define AbsMask v_u32 (0x7fffffff)
+#define TinyBound 0x308 /* top12(asuint(0x1p-30)).  */
+#define BigBound 0x4e8	/* top12(asuint(0x1p30)).  */
+
+#if WANT_SIMD_EXCEPT
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (atanf, x, y, special);
+}
+#endif
+
+/* Fast implementation of vector atanf based on
+   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+   using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
+   v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (atanf) (v_f32_t x)
+{
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t sign = ix & ~AbsMask;
+
+#if WANT_SIMD_EXCEPT
+  v_u32_t ia12 = (ix >> 20) & 0x7ff;
+  v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound);
+  /* If any lane is special, fall back to the scalar routine for all lanes.  */
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (x, x, v_u32 (-1));
+#endif
+
+  /* Argument reduction:
+     y := arctan(x) for x < 1
+     y := pi/2 + arctan(-1/x) for x > 1
+     Hence, use z=-1/a if x>=1, otherwise z=a.  */
+  v_u32_t red = v_cagt_f32 (x, v_f32 (1.0));
+  /* Avoid dependency in abs(x) in division (and comparison).  */
+  v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x);
+  v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f));
+  /* Use absolute value only when needed (odd powers of z).  */
+  v_f32_t az = v_abs_f32 (z);
+  az = v_sel_f32 (red, -az, az);
+
+  /* Calculate the polynomial approximation.  */
+  v_f32_t y = eval_poly (z, az, shift);
+
+  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
+  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (atanf), 2.5)
+PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000)
+PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000)
+PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000)
+#endif
diff --git a/pl/math/v_atanh_3u5.c b/pl/math/v_atanh_3u5.c
new file mode 100644
index 0000000..bfaf5c2
--- /dev/null
+++ b/pl/math/v_atanh_3u5.c
@@ -0,0 +1,61 @@
+/*
+ * Double-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pairwise_horner.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (atanh, x, y, special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+   The greatest observed error is 3.31 ULP:
+   __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+				 want 0x1.ffd8ff31b501cp-6.  */
+VPCS_ATTR
+v_f64_t V_NAME (atanh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t sign = ix & ~AbsMask;
+  v_u64_t ia = ix & AbsMask;
+  v_u64_t special = v_cond_u64 (ia >= One);
+  v_f64_t halfsign = v_as_f64_u64 (sign | Half);
+
+  /* Mask special lanes with 0 to prevent spurious underflow.  */
+  v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia));
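+  /* atanh(x) = 0.5 * log ((1 + x) / (1 - x)) = 0.5 * log1p (2 x / (1 - x));
+     halfsign folds in both the 0.5 factor and sign(x), as atanh is odd.  */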
+  v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax));
+
+  if (unlikely (v_any_u64 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, atanh, -1.0, 1.0)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh))
+PL_TEST_ULP (V_NAME (atanh), 3.32)
+PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0)
+#endif
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
new file mode 100644
index 0000000..cd30696
--- /dev/null
+++ b/pl/math/v_atanhf_3u1.c
@@ -0,0 +1,62 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "v_log1pf_inline.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+   The maximum error is 3.08 ULP:
+   __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+			   want 0x1.ffcb82p-5.  */
+VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_f32_t halfsign
+    = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix));
+  v_u32_t iax = ix & AbsMask;
+
+  v_f32_t ax = v_as_f32_u32 (iax);
+
+#if WANT_SIMD_EXCEPT
+  v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound));
+  /* Side-step special cases by setting those lanes to 0, which will trigger no
+     exceptions. These will be fixed up later.  */
+  if (unlikely (v_any_u32 (special)))
+    ax = v_sel_f32 (special, v_f32 (0), ax);
+#else
+  v_u32_t special = v_cond_u32 (iax >= One);
+#endif
+
+  v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (atanhf, x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (V_NAME (atanhf), 2.59)
+PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0)
+PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0)
+#endif
diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c
new file mode 100644
index 0000000..d5abe41
--- /dev/null
+++ b/pl/math/v_cbrt_2u.c
@@ -0,0 +1,98 @@
+/*
+ * Double-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffffffffffff
+#define TwoThirds v_f64 (0x1.5555555555555p-1)
+#define TinyBound 0x001 /* top12 (smallest_normal).  */
+#define BigBound 0x7ff	/* top12 (infinity).  */
+#define MantissaMask v_u64 (0x000fffffffffffff)
+#define HalfExp v_u64 (0x3fe0000000000000)
+
+#define C(i) v_f64 (__cbrt_data.poly[i])
+#define T(i) v_lookup_f64 (__cbrt_data.table, i)
+
+static NOINLINE v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (cbrt, x, y, special);
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+   and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+   according to the exponent, for instance an error observed for double value
+   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+   integer.
+   __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+				 want 0x1.965fe72821e99p+0.  */
+VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_u64_t ia12 = iax >> 52;
+
+  /* Subnormal, +/-0 and special values.  */
+  v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound));
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+     version of frexp, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp));
+  v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022;
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+     Newton iterations.  */
+  v_f64_t p_01 = v_fma_f64 (C (1), m, C (0));
+  v_f64_t p_23 = v_fma_f64 (C (3), m, C (2));
+  v_f64_t p = v_fma_f64 (m * m, p_23, p_01);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  v_f64_t m_by_3 = m / 3;
+  v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p));
+  a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+     an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
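+
+  /* Worked example: e = 5 gives ey = 5 / 3 = 1 (truncating division) with
+     remainder i = 2, so the result is scaled by T (2 + 2) = 2^(2/3) and by
+     2^ey = 2, i.e. by 2^(5/3) in total.  */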
+
+  v_s64_t ey = e / 3;
+  v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2));
+
+  /* Vector version of ldexp.  */
+  v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my;
+  /* Copy sign.  */
+  y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix));
+
+  if (unlikely (v_any_u64 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_TEST_ULP (V_NAME (cbrt), 1.30)
+PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt))
+PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000)
+PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000)
+#endif
diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c
new file mode 100644
index 0000000..62fa375
--- /dev/null
+++ b/pl/math/v_cbrtf_1u5.c
@@ -0,0 +1,96 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define SignMask v_u32 (0x80000000)
+#define TwoThirds v_f32 (0x1.555556p-1f)
+#define SmallestNormal 0x00800000
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+#define C(i) v_f32 (__cbrtf_data.poly[i])
+#define T(i) v_lookup_f32 (__cbrtf_data.table, i)
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (cbrtf, x, y, special);
+}
+
+/* Approximation for vector single-precision cbrt(x), using one Newton
+   iteration on an initial guess obtained from a low-order polynomial.
+   Greatest error is 1.5 ULP.
+   This is observed for every value where the mantissa is 0x1.81410e and the
+   exponent is a multiple of 3, for example:
+   __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
+			    want 0x1.255d92p+10.  */
+VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+
+  /* Subnormal, +/-0 and special values.  */
+  v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000));
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+     version of frexpf, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp);
+  v_s32_t e = v_as_s32_u32 (iax >> 23) - 126;
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+     the less accurate the next stage of the algorithm needs to be. An order-4
+     polynomial is enough for one Newton iteration.  */
+  v_f32_t p_01 = v_fma_f32 (C (1), m, C (0));
+  v_f32_t p_23 = v_fma_f32 (C (3), m, C (2));
+  v_f32_t p = v_fma_f32 (m * m, p_23, p_01);
+
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  v_f32_t m_by_3 = m / 3;
+  v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p));
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+     an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+
+  v_s32_t ey = e / 3;
+  v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2));
+
+  /* Vector version of ldexpf.  */
+  v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my;
+  /* Copy sign.  */
+  y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y)));
+
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (cbrtf), 1.03)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf))
+PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000)
+PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000)
+#endif
diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c
new file mode 100644
index 0000000..0a9fbf8
--- /dev/null
+++ b/pl/math/v_cosh_2u.c
@@ -0,0 +1,96 @@
+/*
+ * Double-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "v_exp_tail.h"
+
+#define C1 v_f64 (C1_scal)
+#define C2 v_f64 (C2_scal)
+#define C3 v_f64 (C3_scal)
+#define InvLn2 v_f64 (InvLn2_scal)
+#define Ln2hi v_f64 (Ln2hi_scal)
+#define Ln2lo v_f64 (Ln2lo_scal)
+#define IndexMask v_u64 (IndexMask_scal)
+#define Shift v_f64 (Shift_scal)
+#define Thres v_f64 (Thres_scal)
+
+#define AbsMask 0x7fffffffffffffff
+#define Half v_f64 (0.5)
+#define SpecialBound                                                           \
+  0x4086000000000000 /* 0x1.6p9, above which exp overflows.  */
+
+#if V_SUPPORTED
+
+static inline v_f64_t
+exp_inline (v_f64_t x)
+{
+  /* Helper for approximating exp(x). Copied from v_exp_tail, with no
+     special-case handling or tail.  */
+
+  /* n = round(x/(ln2/N)).  */
+  v_f64_t z = v_fma_f64 (x, InvLn2, Shift);
+  v_u64_t u = v_as_u64_f64 (z);
+  v_f64_t n = z - Shift;
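+  /* Adding Shift = 0x1.8p52 forces x * N/ln2 to round to an integer held in
+     the low mantissa bits of z; subtracting Shift back recovers n, while u
+     keeps the integer bits for the table index and exponent below.  */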
+
+  /* r = x - n*ln2/N.  */
+  v_f64_t r = x;
+  r = v_fma_f64 (-Ln2hi, n, r);
+  r = v_fma_f64 (-Ln2lo, n, r);
+
+  v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS);
+  v_u64_t i = u & IndexMask;
+
+  /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  v_f64_t y = v_fma_f64 (C3, r, C2);
+  y = v_fma_f64 (y, r, C1);
+  y = v_fma_f64 (y, r, v_f64 (1)) * r;
+
+  /* s = 2^(n/N).  */
+  u = v_lookup_u64 (Tab, i);
+  v_f64_t s = v_as_f64_u64 (u + e);
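+  /* u is the bit pattern of 2^(i/N); adding e (the integer part of n/N,
+     pre-shifted into the exponent field) scales it to 2^(n/N).  */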
+
+  return v_fma_f64 (y, s, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+   cosh(x) = (exp(x) + exp(-x)) / 2.
+   The greatest observed error is in the scalar fall-back region, so it is the
+   same as the scalar routine's, 1.93 ULP:
+   __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+				 want 0x1.fdf28623ef923p+1021.
+
+   The greatest observed error in the non-special region is 1.54 ULP:
+   __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+				 want 0x1.f711dcb0c77b1p+7.  */
+VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_u64_t special = v_cond_u64 (iax > SpecialBound);
+
+  /* If any inputs are special, fall back to scalar for all lanes.  */
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (cosh, x, x, v_u64 (-1));
+
+  v_f64_t ax = v_as_f64_u64 (iax);
+  /* Up to the point that exp overflows, we can use it to calculate cosh by
+     exp(|x|) / 2 + 1 / (2 * exp(|x|)).  */
+  v_f64_t t = exp_inline (ax);
+  return t * Half + Half / t;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (cosh), 1.43)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh))
+PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000)
+PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000)
+PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000)
+#endif
diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c
new file mode 100644
index 0000000..1422d4d
--- /dev/null
+++ b/pl/math/v_coshf_2u4.c
@@ -0,0 +1,74 @@
+/*
+ * Single-precision vector cosh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffff
+#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this.  */
+#define SpecialBound                                                           \
+  0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use       \
+		special case.  */
+#define Half v_f32 (0.5)
+
+#if V_SUPPORTED
+
+v_f32_t V_NAME (expf) (v_f32_t);
+
+/* Single-precision vector cosh, using vector expf.
+   Maximum error is 2.38 ULP:
+   __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4.  */
+VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_f32_t ax = v_as_f32_u32 (iax);
+  v_u32_t special = v_cond_u32 (iax >= SpecialBound);
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered correctly, fall back to the scalar
+     variant for all inputs if any input is a special value or above the bound
+     at which expf overflows.  */
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+  v_u32_t tiny = v_cond_u32 (iax <= TinyBound);
+  /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+     input to 1, which will generate no exceptions, and then also fixing tiny
+     lanes of output to 1 just before return.  */
+  if (unlikely (v_any_u32 (tiny)))
+    ax = v_sel_f32 (tiny, v_f32 (1), ax);
+#endif
+
+  /* Calculate cosh as exp(|x|) / 2 + exp(-|x|) / 2; with t = exp(|x|), this
+     is t / 2 + 1 / (2 t).  */
+  v_f32_t t = V_NAME (expf) (ax);
+  v_f32_t y = t * Half + Half / t;
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u32 (tiny)))
+    return v_sel_f32 (tiny, v_f32 (1), y);
+#else
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (coshf, x, y, special);
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (coshf), 1.89)
+PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100)
+PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000)
+PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000)
+PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100)
+PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000)
+PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000)
+#endif
diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c
new file mode 100644
index 0000000..1d7ddbb
--- /dev/null
+++ b/pl/math/v_erf_2u.c
@@ -0,0 +1,116 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "include/mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define AbsXMax v_f64 (0x1.8p+2)
+#define Scale v_f64 (0x1p+3)
+
+/* Special cases (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  return v_call_f64 (erf, x, y, cmp);
+}
+
+/* A structure to perform look-up in coeffs and other parameter tables.  */
+struct entry
+{
+  v_f64_t P[V_ERF_NCOEFFS];
+  v_f64_t shift;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  for (int j = 0; j < V_ERF_NCOEFFS; ++j)
+    e.P[j] = __v_erf_data.coeffs[j][i];
+  e.shift = __v_erf_data.shifts[i];
+#else
+  for (int j = 0; j < V_ERF_NCOEFFS; ++j)
+    {
+      e.P[j][0] = __v_erf_data.coeffs[j][i[0]];
+      e.P[j][1] = __v_erf_data.coeffs[j][i[1]];
+    }
+  e.shift[0] = __v_erf_data.shifts[i[0]];
+  e.shift[1] = __v_erf_data.shifts[i[1]];
+#endif
+  return e;
+}
+
+/* Optimized double precision vector error function erf. Maximum
+   observed error is 1.75 ULP, in [0.110, 0.111]:
+   verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4
+			     want 0x1.fe0ed62a54985p-4.  */
+VPCS_ATTR
+v_f64_t V_NAME (erf) (v_f64_t x)
+{
+  /* Handle both inf/nan as well as small values (|x|<2^-28)
+     If any condition in the lane is true then a loop over
+     scalar calls will be performed.  */
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t atop = (ix >> 48) & v_u64 (0x7fff);
+  v_u64_t special_case
+    = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30));
+
+  /* Get sign and absolute value.  */
+  v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask;
+  v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax);
+
+  /* Compute index by truncating 8 * a with a=|x| saturated to 6.0.  */
+
+#ifdef SCALAR
+  v_u64_t i = v_trunc_u64 (a * Scale);
+#else
+  v_u64_t i = vcvtq_n_u64_f64 (a, 3);
+#endif
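+  /* In the vector path, vcvtq_n_u64_f64 (a, 3) converts to unsigned
+     fixed-point with 3 fractional bits, i.e. trunc (8 * a), matching the
+     scalar computation above.  */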
+  /* Get polynomial coefficients and shift parameter using lookup.  */
+  struct entry dat = lookup (i);
+
+  /* Evaluate polynomial on transformed argument.  */
+  v_f64_t z = v_fma_f64 (a, Scale, dat.shift);
+
+  v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]);
+  v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]);
+  v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]);
+  v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]);
+  v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]);
+
+  v_f64_t z2 = z * z;
+  v_f64_t y = v_fma_f64 (z2, r5, r4);
+  y = v_fma_f64 (z2, y, r3);
+  y = v_fma_f64 (z2, y, r2);
+  y = v_fma_f64 (z2, y, r1);
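+  /* r1..r5 are independent degree-1 pieces in z; recombining them in powers
+     of z^2 evaluates the full degree-9 polynomial with a shorter dependency
+     chain than plain Horner.  */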
+
+  /* y=erf(x) if x>0, -erf(-x) otherwise.  */
+  y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
+
+  if (unlikely (v_any_u64 (special_case)))
+    return specialcase (x, y, special_case);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, erf, -6.0, 6.0)
+PL_TEST_ULP (V_NAME (erf), 1.26)
+PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000)
+#endif
diff --git a/pl/math/v_erf_data.c b/pl/math/v_erf_data.c
new file mode 100644
index 0000000..7bbb281
--- /dev/null
+++ b/pl/math/v_erf_data.c
@@ -0,0 +1,119 @@
+/*
+ * Polynomial coefficients and shifts for double-precision erf(x) vector
+ * function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for
+   i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for
+   [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1
+   above 6.
+
+   Coefficients for each interval generated using fpminimax algorithm. See
+   v_erf.sollya for details. Note the array is transposed, so for a set of
+   coefficients C generated on interval i, C[j] is at coeffs[j][i].  */
+
+const struct v_erf_data __v_erf_data
+  = {.shifts
+     = {-0x1p-1019, -1,	 -2,  -3,  -4,	-5,  -6,  -7,  -8,  -9,	 -10, -11, -12,
+	-13,	    -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25,
+	-26,	    -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38,
+	-39,	    -40, -41, -42, -43, -44, -45, -46, -47, 0},
+     .coeffs = {
+       // clang-format off
+
+{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1,
+ 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1,
+ 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1,
+ 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1,
+ 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1,
+ 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1,
+ 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1,
+ 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0},
+
+{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4,
+ 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6,
+ 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10,
+ 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15,
+ 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22,
+ 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31,
+ 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41,
+ 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0},
+
+{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8,
+ -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9,
+ -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12,
+ -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17,
+ -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23,
+ -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32,
+ -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42,
+ -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0},
+
+{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14,
+ 0x1.b6e85963275c5p-15, 0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12,
+ 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14,
+ 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19,
+ 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25,
+ 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33,
+ 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49,
+ 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0},
+
+{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15,
+ 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18,
+ -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18,
+ -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22,
+ -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27,
+ -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35,
+ -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41,
+ -0x1.2aef07c393759p-40, -0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0},
+
+{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21,
+ -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21,
+ -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22,
+ 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25,
+ 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30,
+ 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38,
+ 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40,
+ 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0},
+
+{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23,
+ -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24,
+ 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28,
+ -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28,
+ -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33,
+ -0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43,
+ -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39,
+ -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0},
+
+{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28,
+ 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31,
+ -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31,
+ -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32,
+ 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37,
+ 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40,
+ 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39,
+ 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0},
+
+{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32,
+ 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32,
+ -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 0x1.6ff53581ac827p-34,
+ 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37,
+ -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40,
+ -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41,
+ -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40,
+ -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0},
+
+{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35,
+ -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37,
+ 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39,
+ -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43,
+ 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48,
+ 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43,
+ 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42,
+ 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0}
+       // clang-format on
+     }};
diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c
new file mode 100644
index 0000000..c306351
--- /dev/null
+++ b/pl/math/v_erfc_4u.c
@@ -0,0 +1,168 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "horner.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+/* Accurate exponential (vector variant of exp_dd).  */
+v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
+
+#define One v_f64 (1.0)
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define Scale v_f64 (0x1.0000002p27)
+
+/* Coeffs for polynomial approximation on [0x1.0p-28, 31.0].  */
+#define PX __v_erfc_data.poly
+#define xint __v_erfc_data.interval_bounds
+
+/* Special cases (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  return v_call_f64 (erfc, x, y, cmp);
+}
+
+/* A structure to perform look-up in coeffs and other parameter
+   tables.  */
+struct entry
+{
+  v_f64_t P[ERFC_POLY_ORDER + 1];
+  v_f64_t xi;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
+    e.P[j] = PX[i][j];
+  e.xi = xint[i];
+#else
+  for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
+    {
+      e.P[j][0] = PX[i[0]][j];
+      e.P[j][1] = PX[i[1]][j];
+    }
+  e.xi[0] = xint[i[0]];
+  e.xi[1] = xint[i[1]];
+#endif
+  return e;
+}
+
+/* Accurate evaluation of exp(x^2) using compensated product
+   (x^2 ~ x*x + e2) and custom exp(y+d) routine for small
+   corrections d<<y.  */
+static inline v_f64_t
+v_eval_gauss (v_f64_t a)
+{
+  v_f64_t e2;
+  v_f64_t a2 = a * a;
+
+  /* TwoProduct (Dekker) applied to a * a.  */
+  v_f64_t a_hi = -v_fma_f64 (Scale, a, -a);
+  a_hi = v_fma_f64 (Scale, a, a_hi);
+  v_f64_t a_lo = a - a_hi;
+
+  /* Now assemble error term.  */
+  e2 = v_fma_f64 (-a_hi, a_hi, a2);
+  e2 = v_fma_f64 (-a_hi, a_lo, e2);
+  e2 = v_fma_f64 (-a_lo, a_hi, e2);
+  e2 = v_fma_f64 (-a_lo, a_lo, e2);
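+  /* e2 = a2 - (a_hi + a_lo)^2 is the rounding error of the naive square, so
+     -a2 + e2 represents -x^2 to extended precision.  */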
+
+  /* Fast and accurate evaluation of exp(-a2 + e2) where e2 << a2.  */
+  return V_NAME (exp_tail) (-a2, e2);
+}
+
+/* Optimized double precision vector complementary error function erfc.
+   Maximum measured error is 3.64 ULP:
+   __v_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
+				 want 0x1.ff3f4c8e200d9p-42.  */
+VPCS_ATTR
+v_f64_t V_NAME (erfc) (v_f64_t x)
+{
+  v_f64_t z, p, y;
+  v_u64_t ix, atop, sign, i, cmp;
+
+  ix = v_as_u64_f64 (x);
+  /* Compute fac as early as possible in order to get best performance.  */
+  v_f64_t fac = v_as_f64_u64 ((ix >> 63) << 62);
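+  /* (ix >> 63) << 62 is the bit pattern of 0.0 for positive x and of 2.0 for
+     negative x, exploiting erfc(-x) = 2 - erfc(x).  */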
+  /* Use 12-bit for small, nan and inf case detection.  */
+  atop = (ix >> 52) & 0x7ff;
+  cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd));
+
+  struct entry dat;
+
+  /* If all entries of the vector are out of bounds, take a short path, using
+     the smallest possible number above 28 representable in 12 bits.  */
+  v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404));
+
+  /* Use sign to produce either 0 if x > 0, 2 otherwise.  */
+  if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp)))
+    return fac;
+
+  /* erfc(|x|) = P(|x|-x_i)*exp(-x^2).  */
+
+  v_f64_t a = v_abs_f64 (x);
+
+  /* Interval bounds are a logarithmic scale, i.e. interval n has
+     lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
+     the interval index.  */
+  v_f64_t xp1 = a + v_f64 (1.0);
+  xp1 = xp1 * xp1;
+  xp1 = xp1 * xp1;
+  v_u64_t ixp1 = v_as_u64_f64 (xp1);
+  i = (ixp1 >> 52) - v_u64 (1023);
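+  /* For example, |x| = 1.0 gives xp1^4 = 16.0 with biased exponent 1027, so
+     i = 4: the interval whose lower bound is 2^(4/4) - 1 = 1.  */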
+
+  /* Index cannot exceed number of polynomials.  */
+#ifdef SCALAR
+  i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS;
+#else
+  i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS,
+		i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS};
+#endif
+  /* Get coeffs of i-th polynomial.  */
+  dat = lookup (i);
+
+  /* Evaluate Polynomial: P(|x|-x_i).  */
+  z = a - dat.xi;
+#define C(i) dat.P[i]
+  p = HORNER_12 (z, C);
+
+  /* Evaluate Gaussian: exp(-x^2).  */
+  v_f64_t e = v_eval_gauss (a);
+
+  /* Copy sign.  */
+  sign = v_as_u64_f64 (x) & ~AbsMask;
+  p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign);
+
+  /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise.  */
+  y = v_fma_f64 (p, e, fac);
+
+  /* No need to fix value of y if x is out of bound, as
+     P[ERFC_NUM_INTERVALS]=0.  */
+  if (unlikely (v_any_u64 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (V_NAME (erfc), 3.15)
+PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000)
+PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000)
+#endif
diff --git a/pl/math/v_erfc_data.c b/pl/math/v_erfc_data.c
new file mode 100644
index 0000000..3c47033
--- /dev/null
+++ b/pl/math/v_erfc_data.c
@@ -0,0 +1,96 @@
+/*
+ * Polynomial coefficients for double-precision erfc(x) vector function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have
+   the same bounds as the scalar algorithm, with the exception of the lower
+   bound of the first interval which is larger. This is because the vector
+   variants fall back to the scalar for tiny arguments, meaning that we can use
+   a slightly different approach which is more precise for larger inputs but
+   unacceptably imprecise for tiny inputs.  */
+
+const struct v_erfc_data __v_erfc_data = {
+
+/* Bounds for 20 intervals spanning [0x1.0p-28, 31.0]. Interval bounds are a
+   logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the
+   exception of the first interval.  */
+.interval_bounds = {
+  0x1p-28,		/* If xmin=2^-28, 0 otherwise.  */
+  0x1.837f0518db8a9p-3, /* 0.189.  */
+  0x1.a827999fcef32p-2, /* 0.414.  */
+  0x1.5d13f32b5a75bp-1, /* 0.682.  */
+  0x1.0p0,		/* 1.000.  */
+  0x1.60dfc14636e2ap0,	/* 1.378.  */
+  0x1.d413cccfe779ap0,	/* 1.828.  */
+  0x1.2e89f995ad3adp1,	/* 2.364.  */
+  0x1.8p1,		/* 3.000.  */
+  0x1.e0dfc14636e2ap1,	/* 3.757.  */
+  0x1.2a09e667f3bcdp2,	/* 4.657.  */
+  0x1.6e89f995ad3adp2,	/* 5.727.  */
+  0x1.cp2,		/* 7.000.  */
+  0x1.106fe0a31b715p3,	/* 8.514.  */
+  0x1.4a09e667f3bcdp3,	/* 10.31.  */
+  0x1.8e89f995ad3adp3,	/* 12.45.  */
+  0x1.ep3,		/* 15.00.  */
+  0x1.206fe0a31b715p4,	/* 18.03.  */
+  0x1.5a09e667f3bcdp4,	/* 21.63.  */
+  0x1.9e89f995ad3adp4,	/* 25.91.  */
+  0x1.fp4		/* 31.00.  */
+},
+
+/* Generated using fpminimax algorithm on each interval separately. The
+   polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval
+   [0;b-a], where [a;b] is the interval in which the input lies. Note this is
+   slightly different from the scalar polynomial, which approximates
+   erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details.  */
+.poly = {
+/* 3.725290298461914e-9 < x < 0.18920711500272103.  */
+{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9},
+/* 0.18920711500272103 < x < 0.41421356237309515.  */
+{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16},
+/* 0.41421356237309515 < x < 0.681792830507429.  */
+{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12},
+/* 0.681792830507429 < x < 1.  */
+{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15},
+/* 1 < x < 1.378414230005442.  */
+{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18},
+/* 1.378414230005442 < x < 1.8284271247461903.  */
+{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20},
+/* 1.8284271247461903 < x < 2.363585661014858.  */
+{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22},
+/* 2.363585661014858 < x < 3.  */
+{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25},
+/* 3 < x < 3.756828460010884.  */
+{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28},
+/* 3.756828460010884 < x < 4.656854249492381.  */
+{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31},
+/* 4.656854249492381 < x < 5.727171322029716.  */
+{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34},
+/* 5.727171322029716 < x < 7.  */
+{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37},
+/* 7 < x < 8.513656920021768.  */
+{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41},
+/* 8.513656920021768 < x < 10.313708498984761.  */
+{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44},
+/* 10.313708498984761 < x < 12.454342644059432.  */
+{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47},
+/* 12.454342644059432 < x < 15.  */
+{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51},
+/* 15 < x < 18.027313840043536.  */
+{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54},
+/* 18.027313840043536 < x < 21.627416997969522.  */
+{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58},
+/* 21.627416997969522 < x < 25.908685288118864.  */
+{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61},
+/* 25.908685288118864 < x < 31.  */
+{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64},
+/* Dummy interval for x > 31.  */
+{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0,
+ 0x0p0, 0x0p0, 0x0p0}
+}
+};
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
new file mode 100644
index 0000000..963490d
--- /dev/null
+++ b/pl/math/v_erfcf_1u.c
@@ -0,0 +1,183 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "erfcf.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
+
+VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
+
+static VPCS_ATTR NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (erfcf, x, y, special);
+}
+
+static inline uint32_t
+interval_index (uint32_t ia12)
+{
+  // clang-format off
+  return (ia12 < 0x400 ? 0 :
+         (ia12 < 0x408 ? 1 :
+         (ia12 < 0x410 ? 2 :
+                         3)));
+  // clang-format on
+}
+
+/* The C macro wraps the coeffs argument in order to make the
+   polynomial evaluation more readable. In the scalarised variant the
+   second pointer is ignored.  */
+#ifdef SCALAR
+#define C(i) coeff1[i]
+#else
+#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
+#endif
+
+static inline v_f64_t
+v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
+			   const double *coeff2)
+{
+  v_f64_t x2 = x * x;
+  v_f64_t x4 = x2 * x2;
+  v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
+  v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
+  return poly * gauss;
+}
+
+static inline float
+approx_poly_gauss (float abs_x, const double *coeff)
+{
+  return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
+}
+
+static v_f32_t
+v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
+{
+#ifdef SCALAR
+  float y = approx_poly_gauss (abs_x, P (ia12));
+  return sign ? 2 - y : y;
+#else
+  float32x2_t lo32 = {0, 0};
+  float32x2_t hi32 = {0, 0};
+  /* The polynomial and Gaussian components must be calculated in
+     double precision in order to meet the required ULP error. This
+     means we have to promote low and high halves of the
+     single-precision input vector to two separate double-precision
+     input vectors. This incurs some overhead, and there is also
+     overhead to loading the polynomial coefficients as this cannot be
+     done in a vector fashion. This would be wasted effort for
+     elements which lie in the 'boring' zone, as they will be
+     overwritten later. Hence we use the lanes parameter to only do
+     the promotion on a pair of lanes if both of those lanes are
+     interesting and not special cases. If one lane is inactive, we
+     use a scalar routine which is shared with the scalar variant.  */
+  if (lanes[0] & lanes[1])
+    {
+      lo32 = vcvt_f32_f64 (
+	v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
+				   P (ia12[0]), P (ia12[1])));
+    }
+  else if (lanes[0])
+    {
+      lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
+    }
+  else if (lanes[1])
+    {
+      lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
+    }
+
+  if (lanes[2] & lanes[3])
+    {
+      hi32
+	= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
+						   P (ia12[2]), P (ia12[3])));
+    }
+  else if (lanes[2])
+    {
+      hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
+    }
+  else if (lanes[3])
+    {
+      hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
+    }
+
+  v_f32_t y = vcombine_f32 (lo32, hi32);
+
+  if (v_any_u32 (sign))
+    {
+      y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
+    }
+
+  return y;
+#endif
+}
+
+/* Optimized single-precision vector complementary error function
+   erfcf. Max measured error: 0.750092 at various values between
+   -0x1.06521p-20 and -0x1.add1dap-17. For example:
+   __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
+   +0.249908 ulp err 0.250092.  */
+VPCS_ATTR
+v_f32_t V_NAME (erfcf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t ia = ix & 0x7fffffff;
+  v_u32_t ia12 = ia >> 20;
+  v_u32_t sign = ix >> 31;
+  v_u32_t inf_ia12 = v_u32 (0x7f8);
+
+  v_u32_t special_cases
+    = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
+  v_u32_t in_bounds
+    = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
+  v_f32_t boring_zone = v_as_f32_u32 (sign << 30);
+
+#ifdef SCALAR
+  if (unlikely (special_cases))
+    {
+      if (ia12 >= 0x7f8)
+	return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
+      else
+	return 1.0f - x; /* Small case.  */
+    }
+  else if (likely (!in_bounds))
+    {
+      return sign ? boring_zone : __math_uflowf (boring_zone);
+    }
+#endif
+
+  v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
+			      in_bounds & ~special_cases);
+
+#ifndef SCALAR
+  y = vbslq_f32 (~in_bounds, boring_zone, y);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    {
+      return specialcase (x, y, special_cases);
+    }
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (V_NAME (erfcf), 0.26)
+PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
+PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
+#endif
diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c
new file mode 100644
index 0000000..3a25cc8
--- /dev/null
+++ b/pl/math/v_erff_1u5.c
@@ -0,0 +1,116 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "include/mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t);
+
+#define AbsMask v_u32 (0x7fffffff)
+
+/* Special cases (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  return v_call_f32 (erff, x, y, cmp);
+}
+
+/* A structure to perform look-up in coeffs and other parameter tables.  */
+struct entry
+{
+  v_f32_t P[V_ERFF_NCOEFFS];
+};
+
+static inline struct entry
+lookup (v_u32_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  for (int j = 0; j < V_ERFF_NCOEFFS; ++j)
+    e.P[j] = __v_erff_data.coeffs[j][i];
+#else
+  for (int j = 0; j < V_ERFF_NCOEFFS; ++j)
+    {
+      e.P[j][0] = __v_erff_data.coeffs[j][i[0]];
+      e.P[j][1] = __v_erff_data.coeffs[j][i[1]];
+      e.P[j][2] = __v_erff_data.coeffs[j][i[2]];
+      e.P[j][3] = __v_erff_data.coeffs[j][i[3]];
+    }
+#endif
+  return e;
+}
+
+/* Optimized single-precision vector error function erf.
+   Maximum error, measured at +/- 0.931, is 1.25 ULP:
+   v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1
+			 want -0x1.9f9c8ap-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (erff) (v_f32_t x)
+{
+  /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition
+     in the lane is true then a loop over scalar calls will be performed.  */
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t atop = (ix >> 16) & v_u32 (0x7fff);
+  v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180));
+
+  /* Get sign and absolute value.  */
+  v_u32_t sign = ix & ~AbsMask;
+  /* |x| < 0.921875.  */
+  v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f));
+  /* |x| > 4.0.  */
+  v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f));
+  /* Select the coefficient set: index 0 for the reduced interval
+     (|x| < 0.921875), index 1 otherwise.  */
+  v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1));
+
+  /* Get polynomial coefficients.  */
+  struct entry dat = lookup (i);
+
+  v_f32_t a = v_abs_f32 (x);
+  v_f32_t z = v_sel_f32 (red, x * x, a);
+
+  /* Evaluate Polynomial of |x| or x^2.  */
+  v_f32_t r = dat.P[6];
+  r = v_fma_f32 (z, r, dat.P[5]);
+  r = v_fma_f32 (z, r, dat.P[4]);
+  r = v_fma_f32 (z, r, dat.P[3]);
+  r = v_fma_f32 (z, r, dat.P[2]);
+  r = v_fma_f32 (z, r, dat.P[1]);
+  r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0]));
+  r = v_fma_f32 (a, r, a);
+
+  /* y = |x| + |x|*P(x^2)        if |x| < 0.921875
+     1 - exp (-(|x|+|x|*P(|x|))) otherwise.  */
+  v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r));
+
+  /* Boring domain (absolute value is required to get the sign of erf(-nan)
+     right).  */
+  y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y));
+
+  /* y=erf(x) if x>0, -erf(-x) otherwise.  */
+  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (V_NAME (erff), 0.76)
+PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000)
+PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000)
+#endif
diff --git a/pl/math/v_erff_data.c b/pl/math/v_erff_data.c
new file mode 100644
index 0000000..73ccb5c
--- /dev/null
+++ b/pl/math/v_erff_data.c
@@ -0,0 +1,18 @@
+/*
+ * Data for approximation of vector erff.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff.  */
+const struct v_erff_data __v_erff_data
+  = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f},
+		{0x1.06eba6p-03f, 0x1.450aa0p-1},
+		{-0x1.8126e0p-02f, 0x1.b55cb0p-4f},
+		{0x1.ce1a46p-04f, -0x1.8d6300p-6f},
+		{-0x1.b68bd2p-06f, 0x1.fd1336p-9f},
+		{0x1.473f48p-08f, -0x1.91d2ccp-12f},
+		{-0x1.3a1a82p-11f, 0x1.222900p-16f}}};
diff --git a/pl/math/v_exp_tail.c b/pl/math/v_exp_tail.c
new file mode 100644
index 0000000..fd38aa8
--- /dev/null
+++ b/pl/math/v_exp_tail.c
@@ -0,0 +1,75 @@
+/*
+ * Double-precision vector e^(x+tail) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "math_config.h"
+#if V_SUPPORTED
+#include "v_exp_tail.h"
+
+#define C1 v_f64 (C1_scal)
+#define C2 v_f64 (C2_scal)
+#define C3 v_f64 (C3_scal)
+#define InvLn2 v_f64 (InvLn2_scal)
+#define Ln2hi v_f64 (Ln2hi_scal)
+#define Ln2lo v_f64 (Ln2lo_scal)
+
+#define IndexMask v_u64 (IndexMask_scal)
+#define Shift v_f64 (Shift_scal)
+#define Thres v_f64 (Thres_scal)
+
+VPCS_ATTR
+static v_f64_t
+specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
+{
+  v_f64_t absn = v_abs_f64 (n);
+
+  /* 2^(n/N) may overflow, break it up into s1*s2.  */
+  v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
+  v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
+  v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
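+  /* For n > 0, s1 = 2^769; for n <= 0, s1 = 2^-767. s2 absorbs the remaining
+     scale so each factor stays representable, and when |n| is large enough
+     r1 = s1 * s1 deliberately overflows to inf or underflows to 0.  */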
+  v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
+  v_f64_t r1 = s1 * s1;
+  v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
+  return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
+}
+
+VPCS_ATTR
+v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail)
+{
+  v_f64_t n, r, s, y, z;
+  v_u64_t cmp, u, e, i;
+
+  cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
+
+  /* n = round(x/(ln2/N)).  */
+  z = v_fma_f64 (x, InvLn2, Shift);
+  u = v_as_u64_f64 (z);
+  n = z - Shift;
+
+  /* r = x - n*ln2/N.  */
+  r = x;
+  r = v_fma_f64 (-Ln2hi, n, r);
+  r = v_fma_f64 (-Ln2lo, n, r);
+
+  e = u << (52 - V_EXP_TAIL_TABLE_BITS);
+  i = u & IndexMask;
+
+  /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  y = v_fma_f64 (C3, r, C2);
+  y = v_fma_f64 (y, r, C1);
+  y = v_fma_f64 (y, r, v_f64 (1.0));
+  y = v_fma_f64 (y, r, xtail);
+
+  /* s = 2^(n/N).  */
+  u = v_lookup_u64 (Tab, i);
+  s = v_as_f64_u64 (u + e);
+
+  if (unlikely (v_any_u64 (cmp)))
+    return specialcase (s, y, n);
+  return v_fma_f64 (y, s, s);
+}
+#endif
diff --git a/pl/math/v_exp_tail.h b/pl/math/v_exp_tail.h
new file mode 100644
index 0000000..903f1fd
--- /dev/null
+++ b/pl/math/v_exp_tail.h
@@ -0,0 +1,21 @@
+/*
+ * Constants for double-precision e^(x+tail) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define C1_scal 0x1.fffffffffffd4p-2
+#define C2_scal 0x1.5555571d6b68cp-3
+#define C3_scal 0x1.5555576a59599p-5
+#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2.  */
+#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N.  */
+#define Ln2lo_scal 0x1.abc9e3b39803f3p-64
+
+#define N (1 << V_EXP_TAIL_TABLE_BITS)
+#define Tab __v_exp_tail_data
+#define IndexMask_scal (N - 1)
+#define Shift_scal 0x1.8p+52
+#define Thres_scal 704.0
diff --git a/pl/math/v_exp_tail_data.c b/pl/math/v_exp_tail_data.c
new file mode 100644
index 0000000..675eb76
--- /dev/null
+++ b/pl/math/v_exp_tail_data.c
@@ -0,0 +1,97 @@
+/*
+ * Lookup table for double-precision e^(x+tail) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* 2^(j/N), j=0..N (where N = 256).  */
+const uint64_t __v_exp_tail_data[]
+  = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+     0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+     0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+     0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+     0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+     0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+     0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+     0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+     0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+     0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+     0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+     0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+     0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+     0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+     0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+     0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+     0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+     0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+     0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+     0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+     0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+     0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+     0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+     0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+     0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+     0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+     0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+     0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+     0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+     0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+     0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+     0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+     0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+     0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+     0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+     0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+     0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+     0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+     0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+     0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+     0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+     0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+     0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+     0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+     0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+     0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+     0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+     0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+     0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+     0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+     0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+     0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+     0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+     0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+     0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+     0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+     0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+     0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+     0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+     0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+     0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+     0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+     0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+     0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+     0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+     0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+     0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+     0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+     0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+     0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+     0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+     0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+     0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+     0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+     0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+     0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+     0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+     0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+     0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+     0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+     0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+     0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+     0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+     0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+     0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+     0x3feff9d96b2a23d9};
diff --git a/pl/math/v_expf.c b/pl/math/v_expf.c
new file mode 100644
index 0000000..a422e69
--- /dev/null
+++ b/pl/math/v_expf.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#if V_SUPPORTED
+
+static const float Poly[] = {
+  /* maxerr: 1.45358 +0.5 ulp.  */
+  0x1.0e4020p-7f,
+  0x1.573e2ep-5f,
+  0x1.555e66p-3f,
+  0x1.fffdb6p-2f,
+  0x1.ffffecp-1f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+VPCS_ATTR
+static v_f32_t
+specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
+  v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
+  v_f32_t s2 = v_as_f32_u32 (e - b);
+  v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
+  v_u32_t r2 = v_as_u32_f32 (s1 * s1);
+  v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.  */
+  v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
+  return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
+}
+
+VPCS_ATTR
+v_f32_t
+V_NAME(expf) (v_f32_t x)
+{
+  v_f32_t n, r, r2, scale, p, q, poly, absn, z;
+  v_u32_t cmp, e;
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)];
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+#if 1
+  z = v_fma_f32 (x, InvLn2, Shift);
+  n = z - Shift;
+  r = v_fma_f32 (n, -Ln2hi, x);
+  r = v_fma_f32 (n, -Ln2lo, r);
+  e = v_as_u32_f32 (z) << 23;
+#else
+  z = x * InvLn2;
+  n = v_round_f32 (z);
+  r = v_fma_f32 (n, -Ln2hi, x);
+  r = v_fma_f32 (n, -Ln2lo, r);
+  e = v_as_u32_s32 (v_round_s32 (z)) << 23;
+#endif
+  scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
+  absn = v_abs_f32 (n);
+  cmp = v_cond_u32 (absn > v_f32 (126.0f));
+  r2 = r * r;
+  p = v_fma_f32 (C0, r, C1);
+  q = v_fma_f32 (C2, r, C3);
+  q = v_fma_f32 (p, r2, q);
+  p = C4 * r;
+  poly = v_fma_f32 (q, r2, p);
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (poly, n, e, absn, cmp, scale);
+  return v_fma_f32 (poly, scale, scale);
+}
+VPCS_ALIAS
+#endif
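The #if 1 path above rounds via the floating-point Shift trick rather than an explicit round/convert pair: adding 0x1.8p23f forces the integer part of x/ln2 into the low mantissa bits under round-to-nearest, and those same bits are then shifted into the exponent field to form 2^n. A minimal scalar sketch of the same core, assuming round-to-nearest and |n| <= 126 (the helper name and the crude two-term polynomial are illustrative only):

#include <stdint.h>
#include <string.h>

/* Scalar sketch of the vector fast path: n = round(x/ln2) is obtained by
   adding Shift = 0x1.8p23f, and 2^n is assembled by shifting the low
   mantissa bits of z into the exponent field.  */
static float
expf_core_sketch (float x)
{
  const float Shift = 0x1.8p23f, InvLn2 = 0x1.715476p+0f;
  const float Ln2hi = 0x1.62e4p-1f, Ln2lo = 0x1.7f7d1cp-20f;
  float z = x * InvLn2 + Shift;
  float n = z - Shift;                      /* round (x / ln2).  */
  float r = (x - n * Ln2hi) - n * Ln2lo;    /* reduced argument.  */
  uint32_t zi;
  memcpy (&zi, &z, sizeof (zi));
  uint32_t si = (zi << 23) + 0x3f800000u;   /* asuint (2^n), |n| <= 126.  */
  float scale;
  memcpy (&scale, &si, sizeof (scale));
  float poly = r + 0.5f * r * r;            /* crude placeholder for Poly[].  */
  return scale + scale * poly;              /* 2^n * (1 + poly (r)).  */
}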
diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c
new file mode 100644
index 0000000..4b491d1
--- /dev/null
+++ b/pl/math/v_expm1_2u5.c
@@ -0,0 +1,113 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define TinyBound                                                              \
+  0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define SpecialBound                                                           \
+  0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the      \
+			final stage of the algorithm overflows so fall back to \
+			scalar.  */
+#define AbsMask 0x7fffffffffffffff
+#define One 0x3ff0000000000000
+
+#define C(i) v_f64 (__expm1_poly[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f, v_f64_t f2)
+{
+  /* Evaluate custom polynomial using Estrin scheme.  */
+  v_f64_t p_01 = v_fma_f64 (f, C (1), C (0));
+  v_f64_t p_23 = v_fma_f64 (f, C (3), C (2));
+  v_f64_t p_45 = v_fma_f64 (f, C (5), C (4));
+  v_f64_t p_67 = v_fma_f64 (f, C (7), C (6));
+  v_f64_t p_89 = v_fma_f64 (f, C (9), C (8));
+
+  v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01);
+  v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45);
+  v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89);
+
+  v_f64_t f4 = f2 * f2;
+  v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03);
+  return v_fma_f64 (f4 * f4, p_8a, p_07);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+   The maximum observed error is 2.18 ULP:
+   __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+				  want 0x1.a8b9ea8d66e2p-2.  */
+VPCS_ATTR
+v_f64_t V_NAME (expm1) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t ax = ix & AbsMask;
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered correctly, fall back to the scalar
+     variant for all lanes if any of them should trigger an exception.  */
+  v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (expm1, x, x, v_u64 (-1));
+#else
+  /* Large input, NaNs and Infs.  */
+  v_u64_t special
+    = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000));
+#endif
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+  v_s64_t i = v_to_s64_f64 (j);
+  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+  f = v_fma_f64 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  v_f64_t f2 = f * f;
+  v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  v_f64_t y = v_fma_f64 (p, t, t - 1);
+
+#if !WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (expm1, x, y, special);
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (V_NAME (expm1), 1.68)
+PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000)
+PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000)
+PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000)
+PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
+PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100)
+PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100)
+#endif
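The final assembly above is where the accuracy is won: evaluating 2^i * (p + 1) - 1 directly would cancel catastrophically when 2^i and 1 are close. A scalar sketch of that reconstruction, assuming i is already in the non-overflowing range (names are illustrative):

#include <stdint.h>
#include <string.h>

/* Given i = round(x/ln2) and p ~= expm1(f) with f = x - i*ln2, build
   t = 2^i by placing i in the exponent field, then evaluate p*t + (t - 1),
   which equals 2^i*(p + 1) - 1 without the cancellation of the naive form.  */
static double
expm1_reconstruct_sketch (int64_t i, double p)
{
  uint64_t ti = ((uint64_t) i << 52) + 0x3ff0000000000000ull; /* + One.  */
  double t;
  memcpy (&t, &ti, sizeof (t));
  return p * t + (t - 1.0); /* v_fma_f64 (p, t, t - 1) in the vector code.  */
}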
diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c
new file mode 100644
index 0000000..ab13242
--- /dev/null
+++ b/pl/math/v_expm1f_1u6.c
@@ -0,0 +1,94 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+#define AbsMask (0x7fffffff)
+#define One (0x3f800000)
+#define SpecialBound                                                           \
+  (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x)  \
+		  should round to -1.  */
+#define TinyBound (0x34000000) /* asuint(0x1p-23).  */
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+/* Single-precision vector exp(x) - 1 function.
+   The maximum error is 1.51 ULP:
+   expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2
+			want 0x1.e2fb94p-2.  */
+VPCS_ATTR
+v_f32_t V_NAME (expm1f) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t ax = ix & AbsMask;
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered correctly, fall back to the scalar
+     variant for all lanes if any of them should trigger an exception.  */
+  v_u32_t special
+    = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound));
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff));
+#else
+  /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0.  */
+  v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000));
+#endif
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+  v_s32_t i = v_to_s32_f32 (j);
+  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+  f = v_fma_f32 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+
+  v_f32_t p = v_fma_f32 (C (4), f, C (3));
+  p = v_fma_f32 (p, f, C (2));
+  p = v_fma_f32 (p, f, C (1));
+  p = v_fma_f32 (p, f, C (0));
+  p = v_fma_f32 (f * f, p, f);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  v_f32_t y = v_fma_f32 (p, t, t - 1);
+
+#if !WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u32 (special)))
+    return v_call_f32 (expm1f, x, y, special);
+#endif
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (V_NAME (expm1f), 1.02)
+PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000)
+PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+#endif
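Both expm1 variants repair special lanes with v_call_f32/v_call_f64, which re-evaluates only the flagged lanes with the scalar routine while keeping the fast-path result elsewhere. A plain-C sketch of that mechanism for four lanes (the types and loop are illustrative; the real helper lives in v_math.h):

#include <stdint.h>

typedef float (*scalar_f32_fn) (float);

/* For each lane whose mask is non-zero, replace the fast-path result
   y[lane] with the scalar fallback f (x[lane]).  */
static void
call_f32_sketch (scalar_f32_fn f, const float x[4], float y[4],
                 const uint32_t cmp[4])
{
  for (int lane = 0; lane < 4; lane++)
    if (cmp[lane])
      y[lane] = f (x[lane]);
}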
diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h
new file mode 100644
index 0000000..c261941
--- /dev/null
+++ b/pl/math/v_expm1f_inline.h
@@ -0,0 +1,49 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_EXPM1F_INLINE_H
+#define PL_MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+#include "estrinf.h"
+
+#define One 0x3f800000
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define MLn2hi v_f32 (-0x1.62e4p-1f)
+#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
+
+#define C(i) v_f32 (__expm1f_poly[i])
+
+static inline v_f32_t
+expm1f_inline (v_f32_t x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from v_expm1f_1u6.c, with all special-case handling removed - the
+     calling routine should handle special values if required.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
+  v_s32_t i = v_to_s32_f32 (j);
+  v_f32_t f = v_fma_f32 (j, MLn2hi, x);
+  f = v_fma_f32 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main __v_expm1f routine uses Horner.  */
+  v_f32_t f2 = f * f;
+  v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
+  p = v_fma_f32 (f2, p, f);
+
+  /* t = 2^i.  */
+  v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f32 (p, t, t - 1);
+}
+
+#endif // PL_MATH_V_EXPM1F_INLINE_H
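ESTRIN_4 above evaluates the degree-4 polynomial in independent halves rather than as a serial Horner chain. Assuming the macro follows the usual Estrin recursion, it unrolls to the following (scalar fmaf used for clarity):

#include <math.h>

/* Estrin evaluation of C0 + C1*f + C2*f^2 + C3*f^3 + C4*f^4.  The first
   two fmas do not depend on each other, so wide cores can issue them in
   parallel, unlike a Horner chain of the same degree.  */
static float
estrin_4_sketch (float f, float f2, float f4, const float c[5])
{
  float p01 = fmaf (f, c[1], c[0]); /* C0 + C1*f.  */
  float p23 = fmaf (f, c[3], c[2]); /* C2 + C3*f.  */
  float p03 = fmaf (f2, p23, p01);  /* terms up to f^3.  */
  return fmaf (f4, c[4], p03);      /* add C4*f^4.  */
}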
diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c
new file mode 100644
index 0000000..86d398c
--- /dev/null
+++ b/pl/math/v_log10_2u5.c
@@ -0,0 +1,110 @@
+/*
+ * Double-precision vector log10(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define A(i) v_f64 (__v_log10_data.poly[i])
+#define T(s, i) __v_log10_data.tab[i].s
+#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
+#define N (1 << V_LOG10_TABLE_BITS)
+#define OFF v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+  v_f64_t invc;
+  v_f64_t log10c;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  e.invc = T (invc, i);
+  e.log10c = T (log10c, i);
+#else
+  e.invc[0] = T (invc, i[0]);
+  e.log10c[0] = T (log10c, i[0]);
+  e.invc[1] = T (invc, i[1]);
+  e.log10c[1] = T (log10c, i[1]);
+#endif
+  return e;
+}
+
+VPCS_ATTR
+inline static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  return v_call_f64 (log10, x, y, cmp);
+}
+
+/* Our implementation of v_log10 is a slight modification of v_log (1.660 ulps).
+   Max ULP error: < 2.5 ulp (nearest rounding).
+   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+     __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+				    want 0x1.fff6be3cae4b9p-6
+     -0.459999 ulp err 1.96.  */
+VPCS_ATTR
+v_f64_t V_NAME (log10) (v_f64_t x)
+{
+  v_f64_t z, r, r2, p, y, kd, hi;
+  v_u64_t ix, iz, tmp, top, i, cmp;
+  v_s64_t k;
+  struct entry e;
+
+  ix = v_as_u64_f64 (x);
+  top = ix >> 48;
+  cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = ix - OFF;
+  i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N;
+  k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift.  */
+  iz = ix - (tmp & v_u64 (0xfffULL << 52));
+  z = v_as_f64_u64 (iz);
+  e = lookup (i);
+
+  /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2).  */
+  r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
+  kd = v_to_f64_s64 (k);
+
+  /* hi = r / log(10) + log10(c) + k*log10(2).
+     Constants in `v_log10_data.c` are computed (in extended precision) as
+     e.log10c := e.logc * ivln10.  */
+  v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c);
+
+  /* y = log10(1+r) + n * log10(2).  */
+  hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w);
+
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = r * r;
+  y = v_fma_f64 (A (3), r, A (2));
+  p = v_fma_f64 (A (1), r, A (0));
+  y = v_fma_f64 (A (4), r2, y);
+  y = v_fma_f64 (y, r2, p);
+  y = v_fma_f64 (y, r2, hi);
+
+  if (unlikely (v_any_u64 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (V_NAME (log10), 1.97)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10))
+PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000)
+PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000)
+PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000)
+#endif
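The core of the table method is that k and the subinterval index fall out of integer arithmetic on the bit pattern of x. A scalar sketch of that split, assuming N = 128 as in the data file (names are illustrative):

#include <stdint.h>
#include <string.h>

#define TABLE_BITS 7 /* N = 128 subintervals, as in v_log10_data.c.  */
#define OFF_BITS 0x3fe6900900000000ull

/* Split x = 2^k * z, z in [OFF, 2*OFF): subtracting OFF aligns the
   exponent boundary with the start of the z range, so an arithmetic
   shift recovers k and the top mantissa bits index the table.  */
static void
log_split_sketch (double x, int64_t *k, uint64_t *i, double *z)
{
  uint64_t ix, tmp, iz;
  memcpy (&ix, &x, sizeof (ix));
  tmp = ix - OFF_BITS;
  *i = (tmp >> (52 - TABLE_BITS)) % (1 << TABLE_BITS);
  *k = (int64_t) tmp >> 52; /* arithmetic shift.  */
  iz = ix - (tmp & (0xfffull << 52));
  memcpy (z, &iz, sizeof (*z));
}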
diff --git a/pl/math/v_log10_data.c b/pl/math/v_log10_data.c
new file mode 100644
index 0000000..fda85c8
--- /dev/null
+++ b/pl/math/v_log10_data.c
@@ -0,0 +1,167 @@
+/*
+ * Lookup table for double-precision log10(x) vector function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << V_LOG10_TABLE_BITS)
+
+/* Algorithm:
+
+	x = 2^k z
+	log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10)
+
+where z is in [a; 2a), which is split into N subintervals (a=0x1.69009p-1, N=128),
+and log(c) and 1/c for the ith subinterval come from a lookup table:
+
+	tab[i].invc = 1/c
+	tab[i].log10c = (double)log10(c)
+
+where c is near the center of the subinterval and is chosen by trying several
+floating point invc candidates around 1/center and selecting one for which
+the error in (double)log(c) is minimized (< 0x1p-74); the subinterval that
+contains 1 and the one before it are instead tweaked to avoid cancellation.
+NB: invc should be optimized to minimize error in (double)log10(c) instead.  */
+const struct v_log10_data __v_log10_data
+  = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3},
+	     {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3},
+	     {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3},
+	     {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3},
+	     {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3},
+	     {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3},
+	     {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3},
+	     {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3},
+	     {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3},
+	     {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3},
+	     {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3},
+	     {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4},
+	     {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4},
+	     {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4},
+	     {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4},
+	     {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4},
+	     {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4},
+	     {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4},
+	     {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4},
+	     {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4},
+	     {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4},
+	     {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4},
+	     {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4},
+	     {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4},
+	     {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4},
+	     {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4},
+	     {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4},
+	     {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4},
+	     {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4},
+	     {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4},
+	     {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4},
+	     {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4},
+	     {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4},
+	     {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4},
+	     {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4},
+	     {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4},
+	     {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4},
+	     {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4},
+	     {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4},
+	     {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4},
+	     {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4},
+	     {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5},
+	     {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5},
+	     {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5},
+	     {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5},
+	     {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5},
+	     {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5},
+	     {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5},
+	     {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5},
+	     {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5},
+	     {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5},
+	     {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5},
+	     {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5},
+	     {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5},
+	     {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5},
+	     {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5},
+	     {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5},
+	     {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5},
+	     {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6},
+	     {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6},
+	     {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6},
+	     {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6},
+	     {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6},
+	     {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6},
+	     {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6},
+	     {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6},
+	     {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7},
+	     {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7},
+	     {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7},
+	     {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7},
+	     {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7},
+	     {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8},
+	     {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8},
+	     {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9},
+	     {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10},
+	     {1.0, 0.0},
+	     {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9},
+	     {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8},
+	     {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7},
+	     {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7},
+	     {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6},
+	     {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6},
+	     {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6},
+	     {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6},
+	     {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6},
+	     {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5},
+	     {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5},
+	     {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5},
+	     {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5},
+	     {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5},
+	     {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5},
+	     {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5},
+	     {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5},
+	     {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5},
+	     {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5},
+	     {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4},
+	     {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4},
+	     {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4},
+	     {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4},
+	     {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4},
+	     {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4},
+	     {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4},
+	     {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4},
+	     {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4},
+	     {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4},
+	     {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4},
+	     {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4},
+	     {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4},
+	     {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4},
+	     {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4},
+	     {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4},
+	     {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4},
+	     {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4},
+	     {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4},
+	     {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4},
+	     {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4},
+	     {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4},
+	     {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4},
+	     {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3},
+	     {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3},
+	     {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3},
+	     {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3},
+	     {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3},
+	     {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3},
+	     {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3},
+	     {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3},
+	     {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3},
+	     {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}},
+
+     /* Computed from log coeffs div by log(10) then rounded to double
+	precision.  */
+     .poly
+     = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4,
+	0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4},
+
+     .invln10 = 0x1.bcb7b1526e50ep-2,
+     .log10_2 = 0x1.34413509f79ffp-2
+
+};
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
new file mode 100644
index 0000000..e9f7f03
--- /dev/null
+++ b/pl/math/v_log10f_3u5.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define P(i) v_f32 (__v_log10f_poly[i])
+
+#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218.  */
+#define InvLn10 v_f32 (0x1.bcb7b2p-2f)
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Mask v_u32 (0x007fffff)
+#define Off v_u32 (0x3f2aaaab) /* 0.666667.  */
+
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* Fall back to scalar code.  */
+  return v_call_f32 (log10f, x, y, cmp);
+}
+
+/* Our fast implementation of v_log10f uses the same approach as v_logf.
+   With the same offset as v_logf (i.e., 2/3) it delivers about 3.3 ulps with
+   an order-9 polynomial. This is more efficient than using a low-order
+   polynomial computed in double precision.
+   Maximum error: 3.305 ulps (nearest rounding).
+   __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+			    want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492.  */
+VPCS_ATTR
+v_f32_t V_NAME (log10f) (v_f32_t x)
+{
+  v_f32_t n, o, p, q, r, r2, y;
+  v_u32_t u, cmp;
+
+  u = v_as_u32_f32 (x);
+  cmp = v_cond_u32 (u - Min >= Max - Min);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u -= Off;
+  n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend.  */
+  u &= Mask;
+  u += Off;
+  r = v_as_f32_u32 (u) - v_f32 (1.0f);
+
+  /* y = log10(1+r) + n*log10(2).  */
+  r2 = r * r;
+  /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 +
+     r2*(P6 + r*P7)))).  */
+  o = v_fma_f32 (P (7), r, P (6));
+  p = v_fma_f32 (P (5), r, P (4));
+  q = v_fma_f32 (P (3), r, P (2));
+  y = v_fma_f32 (P (1), r, P (0));
+  p = v_fma_f32 (o, r2, p);
+  q = v_fma_f32 (p, r2, q);
+  y = v_fma_f32 (q, r2, y);
+  /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster
+     but less accurate.  */
+  p = v_fma_f32 (Ln2, n, r);
+  y = v_fma_f32 (y, r2, p * InvLn10);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, log10, 0.01, 11.1)
+PL_TEST_ULP (V_NAME (log10f), 2.81)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f))
+PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000)
+#endif
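The 2/3 offset in the reduction above is what keeps r in [-1/3, 1/3]: subtracting asuint(2/3) before splitting makes the mantissa wrap at 2/3 rather than at 1. A scalar sketch of this step (the name is illustrative; bit tricks as in the vector code):

#include <stdint.h>
#include <string.h>

/* Split x = 2^n * (1 + r) with 1 + r in [2/3, 4/3).  */
static void
logf_reduce_sketch (float x, float *n, float *r)
{
  const uint32_t Off = 0x3f2aaaabu; /* asuint (0.666667).  */
  uint32_t u;
  float t;
  memcpy (&u, &x, sizeof (u));
  u -= Off;
  *n = (float) ((int32_t) u >> 23); /* sign-extended exponent.  */
  u = (u & 0x007fffffu) + Off;      /* put the mantissa back near 1.  */
  memcpy (&t, &u, sizeof (t));
  *r = t - 1.0f;
}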
diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c
new file mode 100644
index 0000000..537482a
--- /dev/null
+++ b/pl/math/v_log10f_data.c
@@ -0,0 +1,13 @@
+/*
+ * Coefficients for single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+const float __v_log10f_poly[] = {
+  /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+     [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
+  -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
+  -0x1.246f8p-4f,  0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};
diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c
new file mode 100644
index 0000000..e482910
--- /dev/null
+++ b/pl/math/v_log1p_2u5.c
@@ -0,0 +1,120 @@
+/*
+ * Double-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
+#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
+#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32.  */
+#define OneMHfRt2Top                                                           \
+  0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)))      \
+			<< 32.  */
+#define OneTop12 0x3ff
+#define BottomMask 0xffffffff
+#define AbsMask 0x7fffffffffffffff
+#define C(i) v_f64 (__log1p_data.coeffs[i])
+
+static inline v_f64_t
+eval_poly (v_f64_t f)
+{
+  v_f64_t f2 = f * f;
+  v_f64_t f4 = f2 * f2;
+  v_f64_t f8 = f4 * f4;
+  return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+}
+
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (log1p, x, y, special);
+}
+
+/* Vector log1p approximation using a polynomial on a reduced interval. The
+   routine is a modification of the algorithm used in scalar log1p, with no
+   shortcut for k=0 and no narrowing for f and k. Maximum observed error is
+   2.46 ULP:
+    __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
+				    want 0x1.fd5565fb590f6p+2 .  */
+VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t ia = ix & AbsMask;
+  v_u64_t special
+    = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000))
+		  | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000));
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (special)))
+    x = v_sel_f64 (special, v_f64 (0), x);
+#endif
+
+  /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
+			   is in [sqrt(2)/2, sqrt(2)]):
+     log1p(x) = k*log(2) + log1p(f).
+
+     f may not be representable exactly, so we need a correction term:
+     let m = round(1 + x), c = (1 + x) - m.
+     c << m, and since log(1+u) ~ u for very small u:
+     log(1+x) - log(m) = log1p(c/m) ~ c/m.
+
+     We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m.  */
+
+  /* Obtain correctly scaled k by manipulation in the exponent.
+     The scalar algorithm casts down to 32-bit at this point to calculate k and
+     u_red. We stay in double-width to obtain f and k, using the same constants
+     as the scalar algorithm but shifted left by 32.  */
+  v_f64_t m = x + 1;
+  v_u64_t mi = v_as_u64_f64 (m);
+  v_u64_t u = mi + OneMHfRt2Top;
+
+  v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12;
+  v_f64_t k = v_to_f64_s64 (ki);
+
+  /* Reduce x to f in [sqrt(2)/2, sqrt(2)].  */
+  v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
+  v_u64_t u_red = utop | (mi & BottomMask);
+  v_f64_t f = v_as_f64_u64 (u_red) - 1;
+
+  /* Correction term c/m.  */
+  v_f64_t cm = (x - (m - 1)) / m;
+
+  /* Approximate log1p(f) on the reduced input using a polynomial. Because
+   log1p(0) = 0 we choose an approximation of the form:
+      x + C0*x^2 + C1*x^3 + C2*x^4 + ...
+   Hence the approximation has the form f + f^2 * P(f)
+      where P(x) = C0 + C1*x + C2*x^2 + ...
+   Assembling this all correctly is dealt with at the final step.  */
+  v_f64_t p = eval_poly (f);
+
+  v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm);
+  v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f);
+  v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi);
+
+  if (unlikely (v_any_u64 (special)))
+    return specialcase (v_as_f64_u64 (ix), y, special);
+
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (V_NAME (log1p), 1.97)
+PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000)
+PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000)
+#endif
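The correction term is the subtle part of the routine above: m = x + 1 rounds, and the rounding error c = (1 + x) - m would otherwise be lost. A scalar sketch of the final assembly, where poly_f stands for the Estrin evaluation of P(f) (names are illustrative):

#include <math.h>

/* log1p(x) ~= k*ln2 + log1p(f) + c/m, with m = fl(1 + x) and
   c = (1 + x) - m the rounding error of the addition; since
   log(1 + u) ~= u for tiny u, the lost bits contribute c/m.  */
static double
log1p_assemble_sketch (double x, double k, double f, double poly_f, double m)
{
  const double Ln2Hi = 0x1.62e42fefa3800p-1;
  const double Ln2Lo = 0x1.ef35793c76730p-45;
  double cm = (x - (m - 1.0)) / m; /* correction term c/m.  */
  double ylo = fma (k, Ln2Lo, cm);
  double yhi = fma (k, Ln2Hi, f);
  return fma (f * f, poly_f, ylo + yhi);
}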
diff --git a/pl/math/v_log1p_inline.h b/pl/math/v_log1p_inline.h
new file mode 100644
index 0000000..e5c7339
--- /dev/null
+++ b/pl/math/v_log1p_inline.h
@@ -0,0 +1,77 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef PL_MATH_V_LOG1P_INLINE_H
+#define PL_MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+#include "pairwise_horner.h"
+
+#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
+#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
+#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32.  */
+#define OneMHfRt2Top                                                           \
+  0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)))      \
+			<< 32.  */
+#define OneTop 0x3ff
+#define BottomMask 0xffffffff
+#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)).  */
+
+#define C(i) v_f64 (__log1p_data.coeffs[i])
+
+static inline v_f64_t
+log1p_inline (v_f64_t x)
+{
+  /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
+     modifications:
+     - No special-case handling - this should be dealt with by the caller.
+     - Pairwise Horner polynomial evaluation for improved accuracy.
+     - Optionally simulate the shortcut for k=0, used in the scalar routine,
+       using v_sel, for improved accuracy when the argument to log1p is close to
+       0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
+       the source of the caller before including this file.
+     See v_log1pf_2u1.c for details of the algorithm.  */
+  v_f64_t m = x + 1;
+  v_u64_t mi = v_as_u64_f64 (m);
+  v_u64_t u = mi + OneMHfRt2Top;
+
+  v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop;
+  v_f64_t k = v_to_f64_s64 (ki);
+
+  /* Reduce x to f in [sqrt(2)/2, sqrt(2)].  */
+  v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
+  v_u64_t u_red = utop | (mi & BottomMask);
+  v_f64_t f = v_as_f64_u64 (u_red) - 1;
+
+  /* Correction term c/m.  */
+  v_f64_t cm = (x - (m - 1)) / m;
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+#error                                                                         \
+  "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+  /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+     that the approximation is solely the polynomial. */
+  v_u64_t k0 = k == 0;
+  if (unlikely (v_any_u64 (k0)))
+    {
+      cm = v_sel_f64 (k0, v_f64 (0), cm);
+      f = v_sel_f64 (k0, x, f);
+    }
+#endif
+
+  /* Approximate log1p(f) on the reduced input using a polynomial.  */
+  v_f64_t f2 = f * f;
+  v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C);
+
+  /* Assemble log1p(x) = k * log2 + log1p(f) + c/m.  */
+  v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm);
+  v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f);
+  return v_fma_f64 (f2, p, ylo + yhi);
+}
+
+#endif // PL_MATH_V_LOG1P_INLINE_H
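Because of the #error guard, a translation unit must state its choice before including the header. A minimal sketch of a caller opting in to the k == 0 shortcut (the routine name is hypothetical; special cases must be screened out before the call):

/* Opt in to the k == 0 shortcut for better accuracy near 0.  */
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"

static inline v_f64_t
some_caller_core (v_f64_t x)
{
  /* x assumed finite, > -1 and not tiny: the helper does no checking.  */
  return log1p_inline (x);
}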
diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c
new file mode 100644
index 0000000..4a7732b
--- /dev/null
+++ b/pl/math/v_log1pf_2u1.c
@@ -0,0 +1,160 @@
+/*
+ * Single-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask 0x7fffffff
+#define TinyBound 0x340 /* top12 (asuint32 (0x1p-23)). ulp=0.5 at 0x1p-23.  */
+#define MinusOne 0xbf800000
+#define Ln2 (0x1.62e43p-1f)
+#define Four 0x40800000
+#define ThreeQuarters v_u32 (0x3f400000)
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+#ifdef V_LOG1PF_1U3
+
+  /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme.  */
+  v_f32_t p = v_fma_f32 (C (8), m, C (7));
+  p = v_fma_f32 (p, m, C (6));
+  p = v_fma_f32 (p, m, C (5));
+  p = v_fma_f32 (p, m, C (4));
+  p = v_fma_f32 (p, m, C (3));
+  p = v_fma_f32 (p, m, C (2));
+  p = v_fma_f32 (p, m, C (1));
+  p = v_fma_f32 (p, m, C (0));
+  return v_fma_f32 (m, m * p, m);
+
+#elif defined(V_LOG1PF_2U5)
+
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+  v_f32_t m2 = m * m;
+  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+  v_f32_t m4 = m2 * m2;
+  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+  return v_fma_f32 (m4, m4 * p_79, p_06);
+
+#else
+#error No precision specified for v_log1pf
+#endif
+}
+
+static inline float
+handle_special (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000)
+    {
+      /* x == -Inf   => log1pf(x) = NaN.
+	 x <  -1.0   => log1pf(x) = NaN.
+	 x == +/-NaN => log1pf(x) = NaN.  */
+#if WANT_SIMD_EXCEPT
+      return __math_invalidf (asfloat (ia));
+#else
+      return NAN;
+#endif
+    }
+  if (ix == 0xbf800000)
+    {
+      /* x == -1.0 => log1pf(x) = -Inf.  */
+#if WANT_SIMD_EXCEPT
+      return __math_divzerof (ix);
+#else
+      return -INFINITY;
+#endif
+    }
+  /* |x| < TinyBound => log1p(x)  =  x.  */
+  return x;
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is
+   the same as for the scalar algorithm, i.e. worst-case error when using Estrin
+   is roughly 2.02 ULP:
+   log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3.  */
+VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8);
+  v_u32_t special_cases
+    = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound))
+      | v_cond_u32 (ix >= MinusOne);
+  v_f32_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u32 (special_cases)))
+    /* Side-step special lanes so fenv exceptions are not triggered
+       inadvertently.  */
+    x = v_sel_f32 (special_cases, v_f32 (1), x);
+#endif
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+			   is in [-0.25, 0.5]):
+     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*k*log(2) to ensure the scale is representable
+     as a normalised fp32 number.  */
+
+  v_f32_t m = x + v_f32 (1.0f);
+
+  /* Choose k to scale x to the range [-1/4, 1/2].  */
+  v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000);
+
+  /* Scale x by exponent manipulation.  */
+  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k));
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number, and scale m down accordingly.  */
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+
+  /* Evaluate polynomial on the reduced interval.  */
+  v_f32_t p = eval_poly (m_scale);
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k.  */
+  v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f);
+
+  /* Apply the scaling back.  */
+  v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p);
+
+  if (unlikely (v_any_u32 (special_cases)))
+    return v_call_f32 (handle_special, special_arg, y, special_cases);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (V_NAME (log1pf), 1.53)
+PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000)
+PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000)
+PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000)
+PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000)
+PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000)
+PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000)
+#endif
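The intermediate factor s = 4*2^-k above exists so that the scale stays a normalised fp32 number even when 2^-k itself would be subnormal. A scalar sketch of the reduction to m_scale = (1 + x)*2^-k - 1 (the name is illustrative; k_bits is the exponent-field delta computed in the routine, assumed not to underflow x):

#include <stdint.h>
#include <string.h>

/* m_scale = (1 + x) * 2^-k - 1 without materialising 2^-k: scaling is
   done by exponent arithmetic on asuint(x) and asuint(4.0f), and
   0.25f * s is exact, so 0.25f * s - 1 recovers 2^-k - 1.  */
static float
log1pf_reduce_sketch (float x, int32_t k_bits)
{
  uint32_t xi, si;
  float xs, s;
  memcpy (&xi, &x, sizeof (xi));
  xi -= (uint32_t) k_bits;              /* xs = x * 2^-k.  */
  memcpy (&xs, &xi, sizeof (xs));
  si = 0x40800000u - (uint32_t) k_bits; /* s = 4 * 2^-k, normalised.  */
  memcpy (&s, &si, sizeof (s));
  return xs + (0.25f * s - 1.0f);
}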
diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h
new file mode 100644
index 0000000..e3048e6
--- /dev/null
+++ b/pl/math/v_log1pf_inline.h
@@ -0,0 +1,55 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_LOG1PF_INLINE_H
+#define PL_MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+
+#define Four 0x40800000
+#define Ln2 v_f32 (0x1.62e43p-1f)
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+  v_f32_t m2 = m * m;
+  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+  v_f32_t m4 = m2 * m2;
+  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+  return v_fma_f32 (m4, m4 * p_79, p_06);
+}
+
+static inline v_f32_t
+log1pf_inline (v_f32_t x)
+{
+  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+     special-case handling. See that file for details of the algorithm.  */
+  v_f32_t m = x + 1.0f;
+  v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+		    + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+  v_f32_t p = eval_poly (m_scale);
+  v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+  return v_fma_f32 (scale_back, Ln2, p);
+}
+
+#endif //  PL_MATH_V_LOG1PF_INLINE_H
diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c
new file mode 100644
index 0000000..fac73f6
--- /dev/null
+++ b/pl/math/v_log2_3u.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define N (1 << V_LOG2_TABLE_BITS)
+#define OFF v_u64 (0x3fe6900900000000)
+#define P(i) v_f64 (__v_log2_data.poly[i])
+
+struct entry
+{
+  v_f64_t invc;
+  v_f64_t log2c;
+};
+
+static inline struct entry
+lookup (v_u64_t i)
+{
+  struct entry e;
+#ifdef SCALAR
+  e.invc = __v_log2_data.tab[i].invc;
+  e.log2c = __v_log2_data.tab[i].log2c;
+#else
+  e.invc[0] = __v_log2_data.tab[i[0]].invc;
+  e.log2c[0] = __v_log2_data.tab[i[0]].log2c;
+  e.invc[1] = __v_log2_data.tab[i[1]].invc;
+  e.log2c[1] = __v_log2_data.tab[i[1]].log2c;
+#endif
+  return e;
+}
+
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+{
+  return v_call_f64 (log2, x, y, cmp);
+}
+
+/* Double-precision vector log2 routine. Implements the same algorithm as vector
+   log10, with coefficients and table entries scaled in extended precision.
+   The maximum observed error is 2.58 ULP:
+   __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+				 want 0x1.fffb34198d9ddp-5.  */
+VPCS_ATTR
+v_f64_t V_NAME (log2) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t top = ix >> 48;
+  v_u64_t special
+    = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
+
+  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  v_u64_t tmp = ix - OFF;
+  v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N;
+  v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift.  */
+  v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52));
+  v_f64_t z = v_as_f64_u64 (iz);
+  struct entry e = lookup (i);
+
+  /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k.  */
+
+  v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
+  v_f64_t kd = v_to_f64_s64 (k);
+  v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c);
+
+  v_f64_t r2 = r * r;
+  v_f64_t p_23 = v_fma_f64 (P (3), r, P (2));
+  v_f64_t p_01 = v_fma_f64 (P (1), r, P (0));
+  v_f64_t y = v_fma_f64 (P (4), r2, p_23);
+  y = v_fma_f64 (r2, y, p_01);
+  y = v_fma_f64 (r2, y, kd + w);
+
+  if (unlikely (v_any_u64 (special)))
+    return specialcase (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, D, 1, log2, 0.01, 11.1)
+PL_TEST_ULP (V_NAME (log2), 2.09)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2))
+PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000)
+#endif
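The lookup above has no gather instruction to lean on, so each lane indexes the table separately. Its shape in plain C for the two-lane double case (struct names are illustrative):

#include <stdint.h>

struct row_sketch { double invc, log2c; };
struct pair_sketch { double invc[2], log2c[2]; };

/* Emulated gather: read one table row per lane.  */
static struct pair_sketch
lookup_sketch (const struct row_sketch *tab, const uint64_t i[2])
{
  struct pair_sketch e;
  for (int lane = 0; lane < 2; lane++)
    {
      e.invc[lane] = tab[i[lane]].invc;
      e.log2c[lane] = tab[i[lane]].log2c;
    }
  return e;
}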
diff --git a/pl/math/v_log2_data.c b/pl/math/v_log2_data.c
new file mode 100644
index 0000000..2a1da68
--- /dev/null
+++ b/pl/math/v_log2_data.c
@@ -0,0 +1,155 @@
+/*
+ * Coefficients and table entries for vector log2
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << V_LOG2_TABLE_BITS)
+
+// clang-format off
+
+const struct v_log2_data __v_log2_data = {
+
+/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6.
+   Each coefficient was scaled by log2(e) in extended precision and rounded back to
+   double.  */
+.poly = { -0x1.71547652b83p-1,    0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2,
+	   0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 },
+
+/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was
+   calculated by scaling log10(c) by log2(10) in extended precision and rounding
+   back.  */
+.tab = {
+{ 0x1.6a133d0dec120p+0,  -0x1.00130d57f5fadp-1 },
+{ 0x1.6815f2f3e42edp+0,  -0x1.f802661bd725ep-2 },
+{ 0x1.661e39be1ac9ep+0,  -0x1.efea1c6f73a5bp-2 },
+{ 0x1.642bfa30ac371p+0,  -0x1.e7dd1dcd06f05p-2 },
+{ 0x1.623f1d916f323p+0,  -0x1.dfdb4ae024809p-2 },
+{ 0x1.60578da220f65p+0,  -0x1.d7e484d101958p-2 },
+{ 0x1.5e75349dea571p+0,  -0x1.cff8ad452f6ep-2 },
+{ 0x1.5c97fd387a75ap+0,  -0x1.c817a666c997fp-2 },
+{ 0x1.5abfd2981f200p+0,  -0x1.c04152d640419p-2 },
+{ 0x1.58eca051dc99cp+0,  -0x1.b87595a3f64b2p-2 },
+{ 0x1.571e526d9df12p+0,  -0x1.b0b4526c44d07p-2 },
+{ 0x1.5554d555b3fcbp+0,  -0x1.a8fd6d1a90f5ep-2 },
+{ 0x1.539015e2a20cdp+0,  -0x1.a150ca2559fc6p-2 },
+{ 0x1.51d0014ee0164p+0,  -0x1.99ae4e62cca29p-2 },
+{ 0x1.50148538cd9eep+0,  -0x1.9215df1a1e842p-2 },
+{ 0x1.4e5d8f9f698a1p+0,  -0x1.8a8761fe1f0d9p-2 },
+{ 0x1.4cab0edca66bep+0,  -0x1.8302bd1cc9a54p-2 },
+{ 0x1.4afcf1a9db874p+0,  -0x1.7b87d6fb437f6p-2 },
+{ 0x1.495327136e16fp+0,  -0x1.741696673a86dp-2 },
+{ 0x1.47ad9e84af28fp+0,  -0x1.6caee2b3c6fe4p-2 },
+{ 0x1.460c47b39ae15p+0,  -0x1.6550a3666c27ap-2 },
+{ 0x1.446f12b278001p+0,  -0x1.5dfbc08de02a4p-2 },
+{ 0x1.42d5efdd720ecp+0,  -0x1.56b022766c84ap-2 },
+{ 0x1.4140cfe001a0fp+0,  -0x1.4f6db1c955536p-2 },
+{ 0x1.3fafa3b421f69p+0,  -0x1.4834579063054p-2 },
+{ 0x1.3e225c9c8ece5p+0,  -0x1.4103fd2249a76p-2 },
+{ 0x1.3c98ec29a211ap+0,  -0x1.39dc8c3fe6dabp-2 },
+{ 0x1.3b13442a413fep+0,  -0x1.32bdeed4b5c8fp-2 },
+{ 0x1.399156baa3c54p+0,  -0x1.2ba80f41e20ddp-2 },
+{ 0x1.38131639b4cdbp+0,  -0x1.249ad8332f4a7p-2 },
+{ 0x1.36987540fbf53p+0,  -0x1.1d96347e7f3ebp-2 },
+{ 0x1.352166b648f61p+0,  -0x1.169a0f7d6604ap-2 },
+{ 0x1.33adddb3eb575p+0,  -0x1.0fa654a221909p-2 },
+{ 0x1.323dcd99fc1d3p+0,  -0x1.08baefcf8251ap-2 },
+{ 0x1.30d129fefc7d2p+0,  -0x1.01d7cd14deecdp-2 },
+{ 0x1.2f67e6b72fe7dp+0,  -0x1.f5f9b1ad55495p-3 },
+{ 0x1.2e01f7cf8b187p+0,  -0x1.e853ff76a77afp-3 },
+{ 0x1.2c9f518ddc86ep+0,  -0x1.dabe5d624cba1p-3 },
+{ 0x1.2b3fe86e5f413p+0,  -0x1.cd38a5cef4822p-3 },
+{ 0x1.29e3b1211b25cp+0,  -0x1.bfc2b38d315f9p-3 },
+{ 0x1.288aa08b373cfp+0,  -0x1.b25c61f5edd0fp-3 },
+{ 0x1.2734abcaa8467p+0,  -0x1.a5058d18e9cacp-3 },
+{ 0x1.25e1c82459b81p+0,  -0x1.97be1113e47a3p-3 },
+{ 0x1.2491eb1ad59c5p+0,  -0x1.8a85cafdf5e27p-3 },
+{ 0x1.23450a54048b5p+0,  -0x1.7d5c97e8fc45bp-3 },
+{ 0x1.21fb1bb09e578p+0,  -0x1.704255d6486e4p-3 },
+{ 0x1.20b415346d8f7p+0,  -0x1.6336e2cedd7bfp-3 },
+{ 0x1.1f6fed179a1acp+0,  -0x1.563a1d9b0cc6ap-3 },
+{ 0x1.1e2e99b93c7b3p+0,  -0x1.494be541aaa6fp-3 },
+{ 0x1.1cf011a7a882ap+0,  -0x1.3c6c1964dd0f2p-3 },
+{ 0x1.1bb44b97dba5ap+0,  -0x1.2f9a99f19a243p-3 },
+{ 0x1.1a7b3e66cdd4fp+0,  -0x1.22d747344446p-3 },
+{ 0x1.1944e11dc56cdp+0,  -0x1.1622020d4f7f5p-3 },
+{ 0x1.18112aebb1a6ep+0,  -0x1.097aabb3553f3p-3 },
+{ 0x1.16e013231b7e9p+0,  -0x1.f9c24b48014c5p-4 },
+{ 0x1.15b1913f156cfp+0,  -0x1.e0aaa3bdc858ap-4 },
+{ 0x1.14859cdedde13p+0,  -0x1.c7ae257c952d6p-4 },
+{ 0x1.135c2dc68cfa4p+0,  -0x1.aecc960a03e58p-4 },
+{ 0x1.12353bdb01684p+0,  -0x1.9605bb724d541p-4 },
+{ 0x1.1110bf25b85b4p+0,  -0x1.7d595ca7147cep-4 },
+{ 0x1.0feeafd2f8577p+0,  -0x1.64c74165002d9p-4 },
+{ 0x1.0ecf062c51c3bp+0,  -0x1.4c4f31c86d344p-4 },
+{ 0x1.0db1baa076c8bp+0,  -0x1.33f0f70388258p-4 },
+{ 0x1.0c96c5bb3048ep+0,  -0x1.1bac5abb3037dp-4 },
+{ 0x1.0b7e20263e070p+0,  -0x1.0381272495f21p-4 },
+{ 0x1.0a67c2acd0ce3p+0,  -0x1.d6de4eba2de2ap-5 },
+{ 0x1.0953a6391e982p+0,  -0x1.a6ec4e8156898p-5 },
+{ 0x1.0841c3caea380p+0,  -0x1.772be542e3e1bp-5 },
+{ 0x1.07321489b13eap+0,  -0x1.479cadcde852dp-5 },
+{ 0x1.062491aee9904p+0,  -0x1.183e4265faa5p-5 },
+{ 0x1.05193497a7cc5p+0,  -0x1.d2207fdaa1b85p-6 },
+{ 0x1.040ff6b5f5e9fp+0,  -0x1.742486cb4a6a2p-6 },
+{ 0x1.0308d19aa6127p+0,  -0x1.1687d77cfc299p-6 },
+{ 0x1.0203beedb0c67p+0,  -0x1.7293623a6b5dep-7 },
+{ 0x1.010037d38bcc2p+0,  -0x1.70ec80ec8f25dp-8 },
+{ 1.0,   0.0 },
+{ 0x1.fc06d493cca10p-1,  0x1.704c1ca6b6bc9p-7 },
+{ 0x1.f81e6ac3b918fp-1,  0x1.6eac8ba664beap-6 },
+{ 0x1.f44546ef18996p-1,  0x1.11e67d040772dp-5 },
+{ 0x1.f07b10382c84bp-1,  0x1.6bc665e2105dep-5 },
+{ 0x1.ecbf7070e59d4p-1,  0x1.c4f8a9772bf1dp-5 },
+{ 0x1.e91213f715939p-1,  0x1.0ebff10fbb951p-4 },
+{ 0x1.e572a9a75f7b7p-1,  0x1.3aaf4d7805d11p-4 },
+{ 0x1.e1e0e2c530207p-1,  0x1.664ba81a4d717p-4 },
+{ 0x1.de5c72d8a8be3p-1,  0x1.9196387da6de4p-4 },
+{ 0x1.dae50fa5658ccp-1,  0x1.bc902f2b7796p-4 },
+{ 0x1.d77a71145a2dap-1,  0x1.e73ab5f584f28p-4 },
+{ 0x1.d41c51166623ep-1,  0x1.08cb78510d232p-3 },
+{ 0x1.d0ca6ba0bb29fp-1,  0x1.1dd2fe2f0dcb5p-3 },
+{ 0x1.cd847e8e59681p-1,  0x1.32b4784400df4p-3 },
+{ 0x1.ca4a499693e00p-1,  0x1.47706f3d49942p-3 },
+{ 0x1.c71b8e399e821p-1,  0x1.5c0768ee4a4dcp-3 },
+{ 0x1.c3f80faf19077p-1,  0x1.7079e86fc7c6dp-3 },
+{ 0x1.c0df92dc2b0ecp-1,  0x1.84c86e1183467p-3 },
+{ 0x1.bdd1de3cbb542p-1,  0x1.98f377a34b499p-3 },
+{ 0x1.baceb9e1007a3p-1,  0x1.acfb803bc924bp-3 },
+{ 0x1.b7d5ef543e55ep-1,  0x1.c0e10098b025fp-3 },
+{ 0x1.b4e749977d953p-1,  0x1.d4a46efe103efp-3 },
+{ 0x1.b20295155478ep-1,  0x1.e8463f45b8d0bp-3 },
+{ 0x1.af279f8e82be2p-1,  0x1.fbc6e3228997fp-3 },
+{ 0x1.ac5638197fdf3p-1,  0x1.079364f2e5aa8p-2 },
+{ 0x1.a98e2f102e087p-1,  0x1.1133306010a63p-2 },
+{ 0x1.a6cf5606d05c1p-1,  0x1.1ac309631bd17p-2 },
+{ 0x1.a4197fc04d746p-1,  0x1.24432485370c1p-2 },
+{ 0x1.a16c80293dc01p-1,  0x1.2db3b5449132fp-2 },
+{ 0x1.9ec82c4dc5bc9p-1,  0x1.3714ee1d7a32p-2 },
+{ 0x1.9c2c5a491f534p-1,  0x1.406700ab52c94p-2 },
+{ 0x1.9998e1480b618p-1,  0x1.49aa1d87522b2p-2 },
+{ 0x1.970d9977c6c2dp-1,  0x1.52de746d7ecb2p-2 },
+{ 0x1.948a5c023d212p-1,  0x1.5c0434336b343p-2 },
+{ 0x1.920f0303d6809p-1,  0x1.651b8ad6c90d1p-2 },
+{ 0x1.8f9b698a98b45p-1,  0x1.6e24a56ab5831p-2 },
+{ 0x1.8d2f6b81726f6p-1,  0x1.771fb04ec29b1p-2 },
+{ 0x1.8acae5bb55badp-1,  0x1.800cd6f19c25ep-2 },
+{ 0x1.886db5d9275b8p-1,  0x1.88ec441df11dfp-2 },
+{ 0x1.8617ba567c13cp-1,  0x1.91be21b7c93f5p-2 },
+{ 0x1.83c8d27487800p-1,  0x1.9a8298f8c7454p-2 },
+{ 0x1.8180de3c5dbe7p-1,  0x1.a339d255c04ddp-2 },
+{ 0x1.7f3fbe71cdb71p-1,  0x1.abe3f59f43db7p-2 },
+{ 0x1.7d055498071c1p-1,  0x1.b48129deca9efp-2 },
+{ 0x1.7ad182e54f65ap-1,  0x1.bd119575364c1p-2 },
+{ 0x1.78a42c3c90125p-1,  0x1.c5955e23ebcbcp-2 },
+{ 0x1.767d342f76944p-1,  0x1.ce0ca8f4e1557p-2 },
+{ 0x1.745c7ef26b00ap-1,  0x1.d6779a5a75774p-2 },
+{ 0x1.7241f15769d0fp-1,  0x1.ded6563550d27p-2 },
+{ 0x1.702d70d396e41p-1,  0x1.e728ffafd840ep-2 },
+{ 0x1.6e1ee3700cd11p-1,  0x1.ef6fb96c8d739p-2 },
+{ 0x1.6c162fc9cbe02p-1,  0x1.f7aaa57907219p-2 }}
+};
+// clang-format on
diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c
new file mode 100644
index 0000000..8f9241b
--- /dev/null
+++ b/pl/math/v_log2f_2u5.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pairwise_hornerf.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#if V_SUPPORTED
+
+#define C(i) v_f32 (__v_log2f_data.poly[i])
+
+#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Mask v_u32 (0x007fffff)
+#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
+
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  /* Fall back to scalar code.  */
+  return v_call_f32 (log2f, x, y, cmp);
+}
+
+/* Fast implementation of single-precision log2, relying on the same
+   argument reduction as Neon logf.
+   Maximum error: 2.48 ULPs
+   __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+			   want 0x1.a9be8p-2.  */
+VPCS_ATTR
+v_f32_t V_NAME (log2f) (v_f32_t x)
+{
+  v_u32_t u = v_as_u32_f32 (x);
+  v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u -= Off;
+  v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend.  */
+  u &= Mask;
+  u += Off;
+  v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f);
+
+  /* y = log2(1+r) + n.  */
+  v_f32_t r2 = r * r;
+  v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C);
+  v_f32_t y = v_fma_f32 (p, r, n);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (x, y, cmp);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, log2, 0.01, 11.1)
+PL_TEST_ULP (V_NAME (log2f), 1.99)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f))
+PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000)
+#endif
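PAIRWISE_HORNER_8 consumes the nine coefficients two at a time: each pair C(2i) + C(2i+1)*r costs one fma, and the pairs are then combined by Horner in r^2, roughly halving the dependency chain. Assuming the macro follows that recursion, it unrolls to (scalar fmaf for clarity):

#include <math.h>

/* Pairwise Horner for C0 + C1*r + ... + C8*r^8.  */
static float
pairwise_horner_8_sketch (float r, float r2, const float c[9])
{
  float p01 = fmaf (r, c[1], c[0]);
  float p23 = fmaf (r, c[3], c[2]);
  float p45 = fmaf (r, c[5], c[4]);
  float p67 = fmaf (r, c[7], c[6]);
  float p = fmaf (r2, c[8], p67);
  p = fmaf (r2, p, p45);
  p = fmaf (r2, p, p23);
  return fmaf (r2, p, p01);
}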
diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c
new file mode 100644
index 0000000..b144e8f
--- /dev/null
+++ b/pl/math/v_log2f_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients for vector log2f
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* See tools/v_log2f.sollya for the algorithm used to generate these
+   coefficients.  */
+const struct v_log2f_data __v_log2f_data
+  = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)).  */
+	      -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f,
+	      -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}};
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
new file mode 100644
index 0000000..a8fa091
--- /dev/null
+++ b/pl/math/v_math.h
@@ -0,0 +1,855 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#ifndef WANT_VMATH
+/* Enable the build of vector math code.  */
+# define WANT_VMATH 1
+#endif
+#if WANT_VMATH
+
+/* The goal of this header is to allow vector (only Neon for now)
+   and scalar builds of the same algorithm.  */
+
+#if SCALAR
+#define V_NAME(x) __s_##x
+#elif VPCS && __aarch64__
+#define V_NAME(x) __vn_##x
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+#else
+#define V_NAME(x) __v_##x
+#endif
+
+#ifndef VPCS_ATTR
+#define VPCS_ATTR
+#endif
+#ifndef VPCS_ALIAS
+#define VPCS_ALIAS
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+
+typedef float f32_t;
+typedef uint32_t u32_t;
+typedef int32_t s32_t;
+typedef double f64_t;
+typedef uint64_t u64_t;
+typedef int64_t s64_t;
+
+/* reinterpret as type1 from type2.  */
+static inline u32_t
+as_u32_f32 (f32_t x)
+{
+  union { f32_t f; u32_t u; } r = {x};
+  return r.u;
+}
+static inline f32_t
+as_f32_u32 (u32_t x)
+{
+  union { u32_t u; f32_t f; } r = {x};
+  return r.f;
+}
+static inline s32_t
+as_s32_u32 (u32_t x)
+{
+  union { u32_t u; s32_t i; } r = {x};
+  return r.i;
+}
+static inline u32_t
+as_u32_s32 (s32_t x)
+{
+  union { s32_t i; u32_t u; } r = {x};
+  return r.u;
+}
+static inline u64_t
+as_u64_f64 (f64_t x)
+{
+  union { f64_t f; u64_t u; } r = {x};
+  return r.u;
+}
+static inline f64_t
+as_f64_u64 (u64_t x)
+{
+  union { u64_t u; f64_t f; } r = {x};
+  return r.f;
+}
+static inline s64_t
+as_s64_u64 (u64_t x)
+{
+  union { u64_t u; s64_t i; } r = {x};
+  return r.i;
+}
+static inline u64_t
+as_u64_s64 (s64_t x)
+{
+  union { s64_t i; u64_t u; } r = {x};
+  return r.u;
+}
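+
+/* These union-based reinterpretations are the strictly conforming C way to
+   inspect floating-point bit patterns (pointer casts would violate strict
+   aliasing). For example (illustration only, not part of this patch):
+
+     as_u32_f32 (1.0f) == 0x3f800000
+     as_u32_f32 (-2.0f) == 0xc0000000
+     as_f32_u32 (0x7f800000) is +Inf
+*/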
+
+#if SCALAR
+#define V_SUPPORTED 1
+typedef f32_t v_f32_t;
+typedef u32_t v_u32_t;
+typedef s32_t v_s32_t;
+typedef f64_t v_f64_t;
+typedef u64_t v_u64_t;
+typedef s64_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 1;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return x;
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return x;
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return x;
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x;
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x;
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x;
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  *x = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  *x = v;
+}
+
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  return x != 0;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return __builtin_fabsf (x);
+}
+static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return (y & ~m) | (x & m);
+}
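+/* v_bsl_u32 mirrors the AArch64 BSL instruction: each result bit comes from
+   x where the corresponding mask bit is set and from y where it is clear,
+   e.g. (illustration only, not part of this patch):
+
+     v_bsl_u32 (0xffff0000, 0xaaaaaaaa, 0x55555555) == 0xaaaa5555
+*/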
+static inline v_u32_t
+v_cagt_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) > fabsf (y);
+}
+/* to wrap |x| >= |y|.  */
+static inline v_u32_t
+v_cage_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) >= fabsf (y);
+}
+static inline v_u32_t
+v_calt_f32 (v_f32_t x, v_f32_t y)
+{
+  return fabsf (x) < fabsf (y);
+}
+static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+  return x / y;
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return __builtin_fmaf (x, y, z);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return __builtin_roundf (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return __builtin_lroundf (x); /* relies on -fno-math-errno.  */
+}
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return p ? x : y;
+}
+static inline v_u32_t
+v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
+{
+  return p ? x : y;
+}
+static inline v_f32_t
+v_sqrt_f32 (v_f32_t x)
+{
+  return __builtin_sqrtf (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return x;
+}
+static inline v_s32_t
+v_to_s32_f32 (v_f32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_s32_t
+v_as_s32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_s32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return tab[idx];
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return f (x);
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return f (x1, x2);
+}
+
+static inline int
+v_lanes64 (void)
+{
+  return 1;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return x;
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return x;
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return x;
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+  return x;
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+  *x = v;
+}
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  return x != 0;
+}
+/* true if all elements of a v_cond result are non-zero.  */
+static inline int
+v_all_u64 (v_u64_t x)
+{
+  return x;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x ? -1 : 0;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return __builtin_fabs (x);
+}
+static inline v_u64_t
+v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
+{
+  return (y & ~m) | (x & m);
+}
+static inline v_u64_t
+v_cagt_f64 (v_f64_t x, v_f64_t y)
+{
+  return fabs (x) > fabs (y);
+}
+static inline v_f64_t
+v_div_f64 (v_f64_t x, v_f64_t y)
+{
+  return x / y;
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return __builtin_fma (x, y, z);
+}
+static inline v_f64_t
+v_min_f64 (v_f64_t x, v_f64_t y)
+{
+  return x < y ? x : y;
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return __builtin_round (x);
+}
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return p ? x : y;
+}
+static inline v_f64_t
+v_sqrt_f64 (v_f64_t x)
+{
+  return __builtin_sqrt (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return __builtin_lround (x); /* relies on -fno-math-errno.  */
+}
+static inline v_u64_t
+v_trunc_u64 (v_f64_t x)
+{
+  return __builtin_trunc (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return x;
+}
+
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return x;
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return tab[idx];
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return f (x);
+}
+static inline v_f64_t
+v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
+	     v_u64_t p)
+{
+  return f (x1, x2);
+}
+
+#elif __aarch64__
+#define V_SUPPORTED 1
+#include <arm_neon.h>
+typedef float32x4_t v_f32_t;
+typedef uint32x4_t v_u32_t;
+typedef int32x4_t v_s32_t;
+typedef float64x2_t v_f64_t;
+typedef uint64x2_t v_u64_t;
+typedef int64x2_t v_s64_t;
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline v_f32_t
+v_f32 (f32_t x)
+{
+  return (v_f32_t){x, x, x, x};
+}
+static inline v_u32_t
+v_u32 (u32_t x)
+{
+  return (v_u32_t){x, x, x, x};
+}
+static inline v_s32_t
+v_s32 (s32_t x)
+{
+  return (v_s32_t){x, x, x, x};
+}
+
+static inline f32_t
+v_get_f32 (v_f32_t x, int i)
+{
+  return x[i];
+}
+static inline u32_t
+v_get_u32 (v_u32_t x, int i)
+{
+  return x[i];
+}
+static inline s32_t
+v_get_s32 (v_s32_t x, int i)
+{
+  return x[i];
+}
+
+static inline void
+v_set_f32 (v_f32_t *x, int i, f32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_u32 (v_u32_t *x, int i, u32_t v)
+{
+  (*x)[i] = v;
+}
+static inline void
+v_set_s32 (v_s32_t *x, int i, s32_t v)
+{
+  (*x)[i] = v;
+}
+
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (v_u32_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
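+/* vpaddd_u64 sums the two 64-bit halves into one scalar, so under the 0/-1
+   lane assumption the sum is non-zero iff at least one lane is set. A
+   generic (but typically slower) alternative would be a max reduction
+   (illustration only, not part of this patch):
+
+     return vmaxvq_u32 (x) != 0;
+*/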
+/* to wrap the result of relational operators.  */
+static inline v_u32_t
+v_cond_u32 (v_u32_t x)
+{
+  return x;
+}
+static inline v_f32_t
+v_abs_f32 (v_f32_t x)
+{
+  return vabsq_f32 (x);
+}
+static inline v_u32_t
+v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
+{
+  return vbslq_u32 (m, x, y);
+}
+static inline v_u32_t
+v_cagt_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcagtq_f32 (x, y);
+}
+/* to wrap |x| >= |y|.  */
+static inline v_u32_t
+v_cage_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcageq_f32 (x, y);
+}
+static inline v_u32_t
+v_calt_f32 (v_f32_t x, v_f32_t y)
+{
+  return vcaltq_f32 (x, y);
+}
+static inline v_f32_t
+v_div_f32 (v_f32_t x, v_f32_t y)
+{
+  return vdivq_f32 (x, y);
+}
+static inline v_f32_t
+v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
+{
+  return vfmaq_f32 (z, x, y);
+}
+static inline v_f32_t
+v_round_f32 (v_f32_t x)
+{
+  return vrndaq_f32 (x);
+}
+static inline v_s32_t
+v_round_s32 (v_f32_t x)
+{
+  return vcvtaq_s32_f32 (x);
+}
+static inline v_f32_t
+v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
+{
+  return vbslq_f32 (p, x, y);
+}
+static inline v_u32_t
+v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
+{
+  return vbslq_u32 (p, x, y);
+}
+static inline v_f32_t
+v_sqrt_f32 (v_f32_t x)
+{
+  return vsqrtq_f32 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f32_t
+v_to_f32_s32 (v_s32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+static inline v_s32_t
+v_to_s32_f32 (v_f32_t x)
+{
+  return vcvtq_s32_f32 (x);
+}
+static inline v_f32_t
+v_to_f32_u32 (v_u32_t x)
+{
+  return (v_f32_t){x[0], x[1], x[2], x[3]};
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u32_t
+v_as_u32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_s32_t
+v_as_s32_f32 (v_f32_t x)
+{
+  union { v_f32_t f; v_s32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_as_f32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_f32_t f; } r = {x};
+  return r.f;
+}
+static inline v_s32_t
+v_as_s32_u32 (v_u32_t x)
+{
+  union { v_u32_t u; v_s32_t i; } r = {x};
+  return r.i;
+}
+static inline v_u32_t
+v_as_u32_s32 (v_s32_t x)
+{
+  union { v_s32_t i; v_u32_t u; } r = {x};
+  return r.u;
+}
+static inline v_f32_t
+v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+{
+  return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_u32_t
+v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+{
+  return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline v_f32_t
+v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+{
+  return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+		   p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline v_f32_t
+v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
+	     v_u32_t p)
+{
+  return (v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0],
+		   p[1] ? f (x1[1], x2[1]) : y[1],
+		   p[2] ? f (x1[2], x2[2]) : y[2],
+		   p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline int
+v_lanes64 (void)
+{
+  return 2;
+}
+static inline v_f64_t
+v_f64 (f64_t x)
+{
+  return (v_f64_t){x, x};
+}
+static inline v_u64_t
+v_u64 (u64_t x)
+{
+  return (v_u64_t){x, x};
+}
+static inline v_s64_t
+v_s64 (s64_t x)
+{
+  return (v_s64_t){x, x};
+}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+  return x[i];
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+  (*x)[i] = v;
+}
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (v_u64_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (x) != 0;
+}
+/* true if all elements of a v_cond result are non-zero.  */
+static inline int
+v_all_u64 (v_u64_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
+}
+/* to wrap the result of relational operators.  */
+static inline v_u64_t
+v_cond_u64 (v_u64_t x)
+{
+  return x;
+}
+static inline v_f64_t
+v_abs_f64 (v_f64_t x)
+{
+  return vabsq_f64 (x);
+}
+static inline v_u64_t
+v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
+{
+  return vbslq_u64 (m, x, y);
+}
+static inline v_u64_t
+v_cagt_f64 (v_f64_t x, v_f64_t y)
+{
+  return vcagtq_f64 (x, y);
+}
+static inline v_f64_t
+v_div_f64 (v_f64_t x, v_f64_t y)
+{
+  return vdivq_f64 (x, y);
+}
+static inline v_f64_t
+v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
+{
+  return vfmaq_f64 (z, x, y);
+}
+static inline v_f64_t
+v_min_f64 (v_f64_t x, v_f64_t y)
+{
+  return vminq_f64 (x, y);
+}
+static inline v_f64_t
+v_round_f64 (v_f64_t x)
+{
+  return vrndaq_f64 (x);
+}
+static inline v_f64_t
+v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
+{
+  return vbslq_f64 (p, x, y);
+}
+static inline v_f64_t
+v_sqrt_f64 (v_f64_t x)
+{
+  return vsqrtq_f64 (x);
+}
+static inline v_s64_t
+v_round_s64 (v_f64_t x)
+{
+  return vcvtaq_s64_f64 (x);
+}
+static inline v_u64_t
+v_trunc_u64 (v_f64_t x)
+{
+  return vcvtq_u64_f64 (x);
+}
+/* convert to type1 from type2.  */
+static inline v_f64_t
+v_to_f64_s64 (v_s64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+static inline v_f64_t
+v_to_f64_u64 (v_u64_t x)
+{
+  return (v_f64_t){x[0], x[1]};
+}
+static inline v_s64_t
+v_to_s64_f64 (v_f64_t x)
+{
+  return vcvtq_s64_f64 (x);
+}
+/* reinterpret as type1 from type2.  */
+static inline v_u64_t
+v_as_u64_f64 (v_f64_t x)
+{
+  union { v_f64_t f; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_as_f64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_f64_t f; } r = {x};
+  return r.f;
+}
+static inline v_s64_t
+v_as_s64_u64 (v_u64_t x)
+{
+  union { v_u64_t u; v_s64_t i; } r = {x};
+  return r.i;
+}
+static inline v_u64_t
+v_as_u64_s64 (v_s64_t x)
+{
+  union { v_s64_t i; v_u64_t u; } r = {x};
+  return r.u;
+}
+static inline v_f64_t
+v_lookup_f64 (const f64_t *tab, v_u64_t idx)
+{
+  return (v_f64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_u64_t
+v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+{
+  return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline v_f64_t
+v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+{
+  return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+}
+static inline v_f64_t
+v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
+	     v_u64_t p)
+{
+  return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0],
+		   p[1] ? f (x1[1], x2[1]) : y[1]};
+}
+#endif
+
+#endif
+#endif
diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c
new file mode 100644
index 0000000..57ec66e
--- /dev/null
+++ b/pl/math/v_sinh_3u.c
@@ -0,0 +1,94 @@
+/*
+ * Double-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define BigBound                                                               \
+  0x4080000000000000 /* 2^9. expm1 helper overflows for large input.  */
+#define TinyBound                                                              \
+  0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x.  */
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define One 0x3ff0000000000000
+#define C(i) v_f64 (__expm1_poly[i])
+
+#if V_SUPPORTED
+
+static inline v_f64_t
+expm1_inline (v_f64_t x)
+{
+  /* Reduce argument:
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where i = round(x / ln2)
+     and   f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
+  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+  v_s64_t i = v_to_s64_f64 (j);
+  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+  f = v_fma_f64 (j, MLn2lo, f);
+  /* Approximate expm1(f) using polynomial.  */
+  v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4;
+  v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f);
+  /* t = 2^i.  */
+  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return v_fma_f64 (p, t, t - 1);
+}
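+
+/* Sketch of the reconstruction used above: with t = 2^i and p ~= expm1(f),
+
+     exp(x) - 1 = t * (p + 1) - 1 = p * t + (t - 1),
+
+   which is exactly the final fused multiply-add.  */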
+
+static NOINLINE VPCS_ATTR v_f64_t
+special_case (v_f64_t x)
+{
+  return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The greatest observed error is 2.57 ULP:
+   sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+			     want 0x1.ab34e59d678d9p-2.  */
+VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t iax = ix & AbsMask;
+  v_f64_t ax = v_as_f64_u64 (iax);
+  v_u64_t sign = ix & ~AbsMask;
+  v_f64_t halfsign = v_as_f64_u64 (sign | Half);
+
+#if WANT_SIMD_EXCEPT
+  v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound));
+#else
+  v_u64_t special = v_cond_u64 (iax >= BigBound);
+#endif
+
+  /* Fall back to scalar variant for all lanes if any of them are special.  */
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x);
+
+  /* Up to the point that expm1 overflows, we can use it to calculate sinh
+     using a slight rearrangement of the definition of sinh. This allows us to
+     retain acceptable accuracy for very small inputs.  */
+  v_f64_t t = expm1_inline (ax);
+  return (t + t / (t + 1)) * halfsign;
+}
+VPCS_ALIAS
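+
+/* Sketch of the identity used above: with t = expm1(|x|) = e^|x| - 1,
+
+     2 sinh(|x|) = e^|x| - e^-|x| = (t + 1) - 1/(t + 1)
+                 = ((t + 1)^2 - 1) / (t + 1) = t + t / (t + 1),
+
+   so multiplying by halfsign (+/-0.5) restores both scale and sign. For
+   small |x|, t ~= |x|, which is what preserves accuracy near zero.  */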
+
+PL_SIG (V, D, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (sinh), 2.08)
+PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000)
+PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000)
+PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000)
+#endif
diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c
new file mode 100644
index 0000000..49cf078
--- /dev/null
+++ b/pl/math/v_sinhf_2u3.c
@@ -0,0 +1,69 @@
+/*
+ * Single-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "v_expm1f_inline.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define BigBound                                                               \
+  0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows.  */
+#define TinyBound                                                              \
+  0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows.  */
+
+static NOINLINE VPCS_ATTR v_f32_t
+special_case (v_f32_t x)
+{
+  return v_call_f32 (sinhf, x, x, v_u32 (-1));
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4.  */
+VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_f32_t ax = v_as_f32_u32 (iax);
+  v_u32_t sign = ix & ~AbsMask;
+  v_f32_t halfsign = v_as_f32_u32 (sign | Half);
+
+#if WANT_SIMD_EXCEPT
+  v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound));
+#else
+  v_u32_t special = v_cond_u32 (iax >= BigBound);
+#endif
+
+  /* Fall back to the scalar variant for all lanes if any of them should trigger
+     an exception.  */
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x);
+
+  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+     using a slight rearrangement of the definition of sinh. This allows us to
+     retain acceptable accuracy for very small inputs.  */
+  v_f32_t t = expm1f_inline (ax);
+  return (t + t / (t + 1)) * halfsign;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (sinhf), 1.76)
+PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000)
+PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000)
+#endif
diff --git a/pl/math/v_tan_3u5.c b/pl/math/v_tan_3u5.c
new file mode 100644
index 0000000..f87bacc
--- /dev/null
+++ b/pl/math/v_tan_3u5.c
@@ -0,0 +1,102 @@
+/*
+ * Double-precision vector tan(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrin.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi)
+#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo)
+#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1)
+#define Shift v_f64 (0x1.8p52)
+#define AbsMask 0x7fffffffffffffff
+#define RangeVal 0x4160000000000000  /* asuint64(2^23).  */
+#define TinyBound 0x3e50000000000000 /* asuint64(2^-26).  */
+#define C(i) v_f64 (__v_tan_data.poly[i])
+
+/* Special cases (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f64_t
+specialcase (v_f64_t x)
+{
+  return v_call_f64 (tan, x, x, v_u64 (-1));
+}
+
+/* Vector approximation for double-precision tan.
+   Maximum measured error is 3.48 ULP:
+   __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+				 want -0x1.f6ccd8ecf7deap+37.   */
+VPCS_ATTR
+v_f64_t V_NAME (tan) (v_f64_t x)
+{
+  v_u64_t iax = v_as_u64_f64 (x) & AbsMask;
+
+  /* Our argument reduction cannot calculate q with sufficient accuracy for very
+     large inputs. Fall back to scalar routine for all lanes if any are too
+     large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
+     input to avoid underflow. Note pl does not supply a scalar double-precision
+     tan, so the fallback will be statically linked from the system libm.  */
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound)))
+#else
+  if (unlikely (v_any_u64 (iax > RangeVal)))
+#endif
+    return specialcase (x);
+
+  /* q = nearest integer to 2 * x / pi.  */
+  v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift;
+  v_s64_t qi = v_to_s64_f64 (q);
+
+  /* Use q to reduce x to r in [-pi/4, pi/4], by:
+     r = x - q * pi/2, in extended precision.  */
+  v_f64_t r = x;
+  r = v_fma_f64 (q, MHalfPiHi, r);
+  r = v_fma_f64 (q, MHalfPiLo, r);
+  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+     formula.  */
+  r = r * 0.5;
+
+  /* Approximate tan(r) using order 8 polynomial.
+     tan(x) is odd, so polynomial has the form:
+     tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+     Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+     Then compute the approximation by:
+     tan(r) ~= r + r^3 * (C0 + r^2 * P(r)).  */
+  v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4;
+  /* Use offset version of Estrin wrapper to evaluate from C1 onwards.  */
+  v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1);
+  p = v_fma_f64 (p, r2, C (0));
+  p = v_fma_f64 (r2, p * r, r);
+
+  /* Recombination uses double-angle formula:
+     tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+     and reciprocity around pi/2:
+     tan(x) = 1 / (tan(pi/2 - x))
+     to assemble result using change-of-sign and conditional selection of
+     numerator/denominator, dependent on the parity of q (hence the quadrant). */
+  v_f64_t n = v_fma_f64 (p, p, v_f64 (-1));
+  v_f64_t d = p * 2;
+
+  v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0);
+
+  return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d);
+}
+VPCS_ALIAS
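+
+/* Sketch of the selection above: with p ~= tan(r), n = p^2 - 1, d = 2 * p
+   and x ~= q * pi/2 + 2r:
+
+     even q:  tan(x) = tan(2r)  = 2p / (1 - p^2)   = -d / n
+     odd  q:  tan(x) = -cot(2r) = (p^2 - 1) / (2p) =  n / d,
+
+   which is exactly the v_sel_f64 pair feeding the final division.  */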
+
+PL_SIG (V, D, 1, tan, -3.1, 3.1)
+PL_TEST_ULP (V_NAME (tan), 2.99)
+PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000)
+PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000)
+PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000)
+PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000)
+PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000)
+PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000)
+#endif
diff --git a/pl/math/v_tan_data.c b/pl/math/v_tan_data.c
new file mode 100644
index 0000000..04e2516
--- /dev/null
+++ b/pl/math/v_tan_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients and helpers for double-precision vector tan(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+const struct v_tan_data __v_tan_data
+  = {.neg_half_pi_hi = -0x1.921fb54442d18p0,
+     .neg_half_pi_lo = -0x1.1a62633145c07p-54,
+     .poly
+     = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
+	0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
+	0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}};
diff --git a/pl/math/v_tanf_3u5.c b/pl/math/v_tanf_3u5.c
new file mode 100644
index 0000000..828466b
--- /dev/null
+++ b/pl/math/v_tanf_3u5.c
@@ -0,0 +1,131 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrinf.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+/* Constants.  */
+#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f))
+#define NegPio2_2 (v_f32 (0x1.777a5cp-25f))
+#define NegPio2_3 (v_f32 (0x1.ee59dap-50f))
+#define InvPio2 (v_f32 (0x1.45f306p-1f))
+#define RangeVal (0x47000000)  /* asuint32(0x1p15f).  */
+#define TinyBound (0x30000000) /* asuint32 (0x1p-31).  */
+#define Shift (v_f32 (0x1.8p+23f))
+#define AbsMask (v_u32 (0x7fffffff))
+
+#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i])
+
+/* Special cases (fall back to scalar calls).  */
+VPCS_ATTR
+NOINLINE static v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+{
+  return v_call_f32 (tanf, x, y, cmp);
+}
+
+/* Use a full Estrin scheme to evaluate the polynomial.  */
+static inline v_f32_t
+eval_poly (v_f32_t z)
+{
+  v_f32_t z2 = z * z;
+#if WANT_SIMD_EXCEPT
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
+     are to be triggered correctly, sidestep this by fixing such lanes to 0.  */
+  v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound);
+  if (unlikely (v_any_u32 (will_uflow)))
+    z2 = v_sel_f32 (will_uflow, v_f32 (0), z2);
+#endif
+  v_f32_t z4 = z2 * z2;
+  return ESTRIN_5 (z, z2, z4, poly);
+}
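+
+/* Assuming the conventional Estrin grouping (see estrinf.h), ESTRIN_5 splits
+   the degree-5 polynomial into independent halves to shorten the dependency
+   chain relative to Horner; a sketch of the expansion (illustration only):
+
+     lo = fma (z, C1, C0);  hi = fma (z, C3, C2);
+     p  = fma (z2, hi, lo);
+     p  = fma (z4, fma (z, C5, C4), p);
+*/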
+
+/* Fast implementation of Neon tanf.
+   Maximum error is 3.45 ULP:
+   __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+			    want 0x1.ff9850p-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (tanf) (v_f32_t x)
+{
+  v_f32_t special_arg = x;
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+
+  /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
+     argument reduction.  */
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered correctly, also special-case tiny
+     input, as this will lead to underflow later. Fix any special lanes to 1 to
+     prevent any exceptions being triggered.  */
+  v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound);
+  if (unlikely (v_any_u32 (special)))
+    x = v_sel_f32 (special, v_f32 (1.0f), x);
+#else
+  /* Otherwise, special-case large and special values.  */
+  v_u32_t special = v_cond_u32 (iax >= RangeVal);
+#endif
+
+  /* n = rint(x/(pi/2)).  */
+  v_f32_t q = v_fma_f32 (InvPio2, x, Shift);
+  v_f32_t n = q - Shift;
+  /* n is representable as a signed integer; simply convert it.  */
+  v_s32_t in = v_round_s32 (n);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
+  v_s32_t alt = in & 1;
+  v_u32_t pred_alt = (alt != 0);
+
+  /* r = x - n * (pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  v_f32_t r;
+  r = v_fma_f32 (NegPio2_1, n, x);
+  r = v_fma_f32 (NegPio2_2, n, r);
+  r = v_fma_f32 (NegPio2_3, n, r);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity, then use symmetries of tangent and the identity
+       tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+       the same polynomial approximation of tan as above.  */
+
+  /* Perform additional reduction if required.  */
+  v_f32_t z = v_sel_f32 (pred_alt, -r, r);
+
+  /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4].  */
+  v_f32_t z2 = r * r;
+  v_f32_t p = eval_poly (z2);
+  v_f32_t y = v_fma_f32 (z * z2, p, z);
+
+  /* Compute reciprocal and apply if required.  */
+  v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y);
+  y = v_sel_f32 (pred_alt, inv_y, y);
+
+  /* Fast reduction does not handle the x = -0.0 case well,
+     so it is fixed here.  */
+  y = v_sel_f32 (x == v_f32 (-0.0), x, y);
+
+  if (unlikely (v_any_u32 (special)))
+    return specialcase (special_arg, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, tan, -3.1, 3.1)
+PL_TEST_ULP (V_NAME (tanf), 2.96)
+PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000)
+PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000)
+#endif
diff --git a/pl/math/v_tanh_3u.c b/pl/math/v_tanh_3u.c
new file mode 100644
index 0000000..c8b6c25
--- /dev/null
+++ b/pl/math/v_tanh_3u.c
@@ -0,0 +1,94 @@
+/*
+ * Double-precision vector tanh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "estrin.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#define AbsMask v_u64 (0x7fffffffffffffff)
+#define InvLn2 v_f64 (0x1.71547652b82fep0)
+#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
+#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
+#define Shift v_f64 (0x1.8p52)
+#define C(i) v_f64 (__expm1_poly[i])
+
+#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4).  */
+#define TinyBound 0x3e40000000000000   /* asuint64 (0x1p-27).  */
+#define One v_u64 (0x3ff0000000000000)
+
+static inline v_f64_t
+expm1_inline (v_f64_t x)
+{
+  /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+     the scalar variant of tanh.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
+  v_s64_t i = v_to_s64_f64 (j);
+  v_f64_t f = v_fma_f64 (j, MLn2hi, x);
+  f = v_fma_f64 (j, MLn2lo, f);
+
+  /* Approximate expm1(f) using polynomial.  */
+  v_f64_t f2 = f * f;
+  v_f64_t f4 = f2 * f2;
+  v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+
+  /* t = 2 ^ i.  */
+  v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+  /* expm1(x) = p * t + (t - 1).  */
+  return v_fma_f64 (p, t, t - 1);
+}
+
+static NOINLINE v_f64_t
+special_case (v_f64_t x, v_f64_t y, v_u64_t special)
+{
+  return v_call_f64 (tanh, x, y, special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+   version of expm1. The greatest observed error is 2.75 ULP:
+   __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3
+				  want -0x1.ba31ba4691ab4p-3.  */
+VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x)
+{
+  v_u64_t ix = v_as_u64_f64 (x);
+  v_u64_t ia = ix & AbsMask;
+
+  /* Trigger special-cases for tiny, boring and infinity/NaN.  */
+  v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound));
+  v_f64_t u;
+
+  /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+     They will be fixed up later by the special-case handler.  */
+  if (unlikely (v_any_u64 (special)))
+    u = v_sel_f64 (special, v_f64 (1), x) * 2;
+  else
+    u = x * 2;
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  v_f64_t q = expm1_inline (u);
+  v_f64_t y = q / (q + 2);
+
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
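+
+/* Sketch of the identity used above: with q = expm1(2x) = e^(2x) - 1,
+
+     tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2),
+
+   so a single expm1 evaluation suffices.  */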
+
+PL_SIG (V, D, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (tanh), 2.26)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh))
+PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000)
+PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000)
+PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000)
+PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000)
+PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000)
+#endif
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
new file mode 100644
index 0000000..3616611
--- /dev/null
+++ b/pl/math/v_tanhf_2u6.c
@@ -0,0 +1,69 @@
+/*
+ * Single-precision vector tanh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if V_SUPPORTED
+
+#include "v_expm1f_inline.h"
+
+#define BoringBound                                                            \
+  0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for        \
+		negative).  */
+#define AbsMask 0x7fffffff
+
+static NOINLINE v_f32_t
+special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified version
+   of expm1f. The maximum error is 2.58 ULP:
+   __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+			  want 0x1.f9ba08p-5.  */
+VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+{
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & AbsMask;
+  v_u32_t sign = ix & ~AbsMask;
+  v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
+  v_f32_t boring = v_as_f32_u32 (sign | One);
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered properly, set all special and boring
+     lanes to 1, which will trigger no exceptions, and fix them up later.  */
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
+  ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+  if (unlikely (v_any_u32 (special)))
+    ix = v_sel_u32 (special, v_u32 (One), ix);
+#else
+  v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
+  v_f32_t y = q / (q + 2);
+  y = v_sel_f32 (is_boring, boring, y);
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, y, special);
+  return y;
+}
+VPCS_ALIAS
+
+PL_SIG (V, F, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (V_NAME (tanhf), 2.09)
+PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000)
+PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100)
+PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100)
+#endif
diff --git a/pl/math/vn_acosh_3u5.c b/pl/math/vn_acosh_3u5.c
new file mode 100644
index 0000000..649735b
--- /dev/null
+++ b/pl/math/vn_acosh_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_acosh.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh)
+#include "v_acosh_3u5.c"
+#endif
diff --git a/pl/math/vn_acoshf_3u1.c b/pl/math/vn_acoshf_3u1.c
new file mode 100644
index 0000000..8c5f106
--- /dev/null
+++ b/pl/math/vn_acoshf_3u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_acoshf.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf)
+#include "v_acoshf_3u1.c"
+#endif
diff --git a/pl/math/vn_asinh_3u5.c b/pl/math/vn_asinh_3u5.c
new file mode 100644
index 0000000..0d2373b
--- /dev/null
+++ b/pl/math/vn_asinh_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_asinh.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh)
+#include "v_asinh_3u5.c"
+#endif
diff --git a/pl/math/vn_asinhf_2u7.c b/pl/math/vn_asinhf_2u7.c
new file mode 100644
index 0000000..6c8927f
--- /dev/null
+++ b/pl/math/vn_asinhf_2u7.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_asinhf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf)
+#include "v_asinhf_2u7.c"
+#endif
diff --git a/pl/math/vn_atan2_3u.c b/pl/math/vn_atan2_3u.c
new file mode 100644
index 0000000..925b5b4
--- /dev/null
+++ b/pl/math/vn_atan2_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atan2.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2)
+#include "v_atan2_3u.c"
+#endif
diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c
new file mode 100644
index 0000000..51d33d5
--- /dev/null
+++ b/pl/math/vn_atan2f_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atan2f.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f)
+#include "v_atan2f_3u.c"
+#endif
diff --git a/pl/math/vn_atan_2u5.c b/pl/math/vn_atan_2u5.c
new file mode 100644
index 0000000..ccebce2
--- /dev/null
+++ b/pl/math/vn_atan_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atan.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan)
+#include "v_atan_2u5.c"
+#endif
diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c
new file mode 100644
index 0000000..b879727
--- /dev/null
+++ b/pl/math/vn_atanf_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanf.
+ *
+ * Copyright (c) 2021-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf)
+#include "v_atanf_3u.c"
+#endif
diff --git a/pl/math/vn_atanh_3u5.c b/pl/math/vn_atanh_3u5.c
new file mode 100644
index 0000000..19429b2
--- /dev/null
+++ b/pl/math/vn_atanh_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanh.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh)
+#include "v_atanh_3u5.c"
+#endif
diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c
new file mode 100644
index 0000000..7de226d
--- /dev/null
+++ b/pl/math/vn_atanhf_3u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanhf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf)
+#include "v_atanhf_3u1.c"
+#endif
diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c
new file mode 100644
index 0000000..4cb0dc8
--- /dev/null
+++ b/pl/math/vn_cbrt_2u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrt.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt)
+#include "v_cbrt_2u.c"
+#endif
diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c
new file mode 100644
index 0000000..40a72d8
--- /dev/null
+++ b/pl/math/vn_cbrtf_1u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cbrtf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf)
+#include "v_cbrtf_1u5.c"
+#endif
diff --git a/pl/math/vn_cosh_2u.c b/pl/math/vn_cosh_2u.c
new file mode 100644
index 0000000..9bf7f02
--- /dev/null
+++ b/pl/math/vn_cosh_2u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_cosh.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh)
+#include "v_cosh_2u.c"
+#endif
diff --git a/pl/math/vn_coshf_2u4.c b/pl/math/vn_coshf_2u4.c
new file mode 100644
index 0000000..b149cb3
--- /dev/null
+++ b/pl/math/vn_coshf_2u4.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_coshf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf)
+#include "v_coshf_2u4.c"
+#endif
diff --git a/pl/math/vn_erf_2u.c b/pl/math/vn_erf_2u.c
new file mode 100644
index 0000000..95bd141
--- /dev/null
+++ b/pl/math/vn_erf_2u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_erf.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf)
+#include "v_erf_2u.c"
+#endif
diff --git a/pl/math/vn_erfc_4u.c b/pl/math/vn_erfc_4u.c
new file mode 100644
index 0000000..1cf6546
--- /dev/null
+++ b/pl/math/vn_erfc_4u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_erfc.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc)
+#include "v_erfc_4u.c"
+#endif
diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c
new file mode 100644
index 0000000..ef5a21d
--- /dev/null
+++ b/pl/math/vn_erfcf_1u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_erfcf.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf)
+#include "v_erfcf_1u.c"
+#endif
diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c
new file mode 100644
index 0000000..ee8848e
--- /dev/null
+++ b/pl/math/vn_erff_1u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_erff.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff)
+#include "v_erff_1u5.c"
+#endif
diff --git a/pl/math/vn_exp_tail.c b/pl/math/vn_exp_tail.c
new file mode 100644
index 0000000..52a57fe
--- /dev/null
+++ b/pl/math/vn_exp_tail.c
@@ -0,0 +1,11 @@
+/*
+ * AdvSIMD vector PCS variant of __v_exp_tail.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#include "v_exp_tail.c"
+#endif
diff --git a/pl/math/vn_expf.c b/pl/math/vn_expf.c
new file mode 100644
index 0000000..83e7f0a
--- /dev/null
+++ b/pl/math/vn_expf.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expf.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
+#include "v_expf.c"
+#endif
diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c
new file mode 100644
index 0000000..35111e2
--- /dev/null
+++ b/pl/math/vn_expm1_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expm1.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1)
+#include "v_expm1_2u5.c"
+#endif
diff --git a/pl/math/vn_expm1f_1u6.c b/pl/math/vn_expm1f_1u6.c
new file mode 100644
index 0000000..bea491f
--- /dev/null
+++ b/pl/math/vn_expm1f_1u6.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_expm1f.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f)
+#include "v_expm1f_1u6.c"
+#endif
diff --git a/pl/math/vn_log10_2u5.c b/pl/math/vn_log10_2u5.c
new file mode 100644
index 0000000..5f32c33
--- /dev/null
+++ b/pl/math/vn_log10_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log10.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10)
+#include "v_log10_2u5.c"
+#endif
diff --git a/pl/math/vn_log10f_3u5.c b/pl/math/vn_log10f_3u5.c
new file mode 100644
index 0000000..2673ef5
--- /dev/null
+++ b/pl/math/vn_log10f_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log10f.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f)
+#include "v_log10f_3u5.c"
+#endif
diff --git a/pl/math/vn_log1p_2u5.c b/pl/math/vn_log1p_2u5.c
new file mode 100644
index 0000000..3f4f8d1
--- /dev/null
+++ b/pl/math/vn_log1p_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log1p.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p)
+#include "v_log1p_2u5.c"
+#endif
diff --git a/pl/math/vn_log1pf_2u1.c b/pl/math/vn_log1pf_2u1.c
new file mode 100644
index 0000000..a319bc9
--- /dev/null
+++ b/pl/math/vn_log1pf_2u1.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log1pf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf)
+#include "v_log1pf_2u1.c"
+#endif
diff --git a/pl/math/vn_log2_3u.c b/pl/math/vn_log2_3u.c
new file mode 100644
index 0000000..a870392
--- /dev/null
+++ b/pl/math/vn_log2_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log2.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2)
+#include "v_log2_3u.c"
+#endif
diff --git a/pl/math/vn_log2f_2u5.c b/pl/math/vn_log2f_2u5.c
new file mode 100644
index 0000000..b4a9cb7
--- /dev/null
+++ b/pl/math/vn_log2f_2u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_log2f.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f)
+#include "v_log2f_2u5.c"
+#endif
diff --git a/pl/math/vn_sinh_3u.c b/pl/math/vn_sinh_3u.c
new file mode 100644
index 0000000..7c881de
--- /dev/null
+++ b/pl/math/vn_sinh_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_sinh.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh)
+#include "v_sinh_3u.c"
+#endif
diff --git a/pl/math/vn_sinhf_2u3.c b/pl/math/vn_sinhf_2u3.c
new file mode 100644
index 0000000..251e732
--- /dev/null
+++ b/pl/math/vn_sinhf_2u3.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_sinhf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf)
+#include "v_sinhf_2u3.c"
+#endif
diff --git a/pl/math/vn_tan_3u5.c b/pl/math/vn_tan_3u5.c
new file mode 100644
index 0000000..a4efb06
--- /dev/null
+++ b/pl/math/vn_tan_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tan.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan)
+#include "v_tan_3u5.c"
+#endif
diff --git a/pl/math/vn_tanf_3u5.c b/pl/math/vn_tanf_3u5.c
new file mode 100644
index 0000000..a88cb40
--- /dev/null
+++ b/pl/math/vn_tanf_3u5.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanf.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf)
+#include "v_tanf_3u5.c"
+#endif
diff --git a/pl/math/vn_tanh_3u.c b/pl/math/vn_tanh_3u.c
new file mode 100644
index 0000000..cb2746c
--- /dev/null
+++ b/pl/math/vn_tanh_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanh.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh)
+#include "v_tanh_3u.c"
+#endif
diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c
new file mode 100644
index 0000000..47f0a7f
--- /dev/null
+++ b/pl/math/vn_tanhf_2u6.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_tanhf.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf)
+#include "v_tanhf_2u6.c"
+#endif
diff --git a/string/Dir.mk b/string/Dir.mk
index cf3453f..40ff5ac 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,7 +1,7 @@
 # Makefile fragment - requires GNU make
 #
 # Copyright (c) 2019-2021, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/string
 B := build/string
diff --git a/string/README.contributors b/string/README.contributors
new file mode 100644
index 0000000..0b4a51b
--- /dev/null
+++ b/string/README.contributors
@@ -0,0 +1,30 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc, so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This means, for example, that internal
+   symbols should be hidden and placed in the implementation-reserved namespace
+   according to ISO C and POSIX rules. If possible, the built shared libraries
+   and static library archives should be usable to override libc symbols at
+   link time (or at runtime via LD_PRELOAD). This requires the symbols to
+   follow the glibc ABI (other than symbol versioning); this cannot be done
+   reliably for static linking, so it is a best-effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY
+================================================
+1. Code:
+   - The assumptions of the code must be clearly documented.
+
+   - Assembly style should be consistent across different implementations.
+
+
+2. Performance:
+   - Benchmarking is needed on several microarchitectures.
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 84339f7..207e229 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -1,8 +1,8 @@
 /*
  * __mtag_tag_region - tag memory
  *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -15,7 +15,7 @@
  * The memory region may remain untagged if tagging is not enabled.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_MEMORY_TAGGING
 
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index f58364c..44b8e01 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -1,8 +1,8 @@
 /*
  * __mtag_tag_zero_region - tag memory and fill it with zero bytes
  *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -15,7 +15,7 @@
  * The memory region may remain untagged if tagging is not enabled.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_MEMORY_TAGGING
 
diff --git a/string/asmdefs.h b/string/aarch64/asmdefs.h
similarity index 83%
rename from string/asmdefs.h
rename to string/aarch64/asmdefs.h
index 340b427..069b146 100644
--- a/string/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -1,15 +1,13 @@
 /*
- * Macros for asm code.
+ * Macros for asm code.  AArch64 version.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _ASMDEFS_H
 #define _ASMDEFS_H
 
-#if defined(__aarch64__)
-
 /* Branch Target Identification support.  */
 #define BTI_C		hint	34
 #define BTI_J		hint	36
@@ -55,19 +53,6 @@
   .cfi_startproc;	\
   BTI_C;
 
-#else
-
-#define END_FILE
-
-#define ENTRY_ALIGN(name, alignment)	\
-  .global name;		\
-  .type name,%function;	\
-  .align alignment;		\
-  name:			\
-  .cfi_startproc;
-
-#endif
-
 #define ENTRY(name)	ENTRY_ALIGN(name, 6)
 
 #define ENTRY_ALIAS(name)	\
@@ -95,4 +80,13 @@
 #define SIZE_ARG(n)
 #endif
 
+/* Compiler supports SVE instructions.  */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+#   define HAVE_SVE 1
+# else
+#   define HAVE_SVE 0
+# endif
+#endif
+
 #endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index 5a54242..131b7fa 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -1,8 +1,8 @@
 /*
  * check ARCH setting.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if !__aarch64__
@@ -10,4 +10,4 @@
 #endif
 
 /* Include for GNU property notes.  */
-#include "../asmdefs.h"
+#include "asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index c2e967d..948c3cb 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -23,25 +23,21 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take 4 bits from every comparison byte using a
+   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros identifies exactly which byte matched.  */
 
 ENTRY (__memchr_aarch64_mte)
 	PTR_ARG (0)
@@ -50,55 +46,53 @@
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
 
 	rbit	synd, synd
 	clz	synd, synd
-	add	result, srcin, synd, lsr 2
 	cmp	cntin, synd, lsr 2
+	add	result, srcin, synd, lsr 2
 	csel	result, result, xzr, hi
 	ret
 
+	.p2align 3
 L(start_loop):
 	sub	tmp, src, srcin
-	add	tmp, tmp, 16
+	add	tmp, tmp, 17
 	subs	cntrem, cntin, tmp
-	b.ls	L(nomatch)
+	b.lo	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
-
+	tbz	cntrem, 4, L(loop32_2)
+	sub	src, src, 16
 	.p2align 4
 L(loop32):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 
 L(loop32_2):
-	ldr	qdata, [src, 16]!
-	subs	cntrem, cntrem, 32
+	ldr	qdata, [src, 16]
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	b.ls	L(end)
+	subs	cntrem, cntrem, 32
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
+L(end_2):
+	add	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	sub	cntrem, src, srcin
 	fmov	synd, dend
-	add	tmp, srcin, cntin
-	sub	cntrem, tmp, src
+	sub	cntrem, cntin, cntrem
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
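
A scalar C sketch of the nibble-mask syndrome described above (illustrative
only; the routine builds the mask with NEON cmeq and shrn rather than a
loop, and __builtin_ctzll is a GCC/Clang builtin standing in for rbit+clz):

#include <stdint.h>
#include <stdio.h>

/* Each of the 16 chunk bytes contributes 4 bits to a 64-bit syndrome, so
   the lowest set bit, divided by 4, indexes the first matching byte.  */
static int first_match (const unsigned char *chunk, unsigned char c)
{
	uint64_t syndrome = 0;
	for (int i = 0; i < 16; i++)
		if (chunk[i] == c)
			syndrome |= 0xFull << (4 * i);	/* 4 bits per byte */
	if (syndrome == 0)
		return -1;				/* no match in this chunk */
	return __builtin_ctzll (syndrome) / 4;		/* bit index -> byte index */
}

int main (void)
{
	unsigned char buf[16] = "find the x here!";
	printf ("%d\n", first_match (buf, 'x'));	/* prints 9 */
	return 0;
}
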
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index c22e659..b851cf3 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,11 +1,11 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 353f0d1..fe6cfe2 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,8 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 78c5eca..d52ce45 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,11 +1,11 @@
 /*
  * memcmp - compare memory
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 3b10266..35135e7 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,103 +1,84 @@
 /* memcmp - compare memory
  *
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		w0
+#define src1	x0
+#define src2	x1
+#define limit	x2
+#define result	w0
 
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data1h		x4
-#define data2		x5
-#define data2w		w5
-#define data2h		x6
-#define tmp1		x7
-#define tmp2		x8
+#define data1	x3
+#define data1w	w3
+#define data2	x4
+#define data2w	w4
+#define data3	x5
+#define data3w	w5
+#define data4	x6
+#define data4w	w6
+#define tmp	x6
+#define src1end	x7
+#define src2end	x8
+
 
 ENTRY (__memcmp_aarch64)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
-	subs	limit, limit, 8
-	b.lo	L(less8)
 
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
+	cmp	limit, 16
+	b.lo	L(less16)
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
+	ccmp	data1, data2, 0, ne
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
+	b.ls	L(last_bytes)
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
+
+	.p2align 4
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	b.ne	L(return)
-
-	subs	limit, limit, 8
-	b.gt	L(more16)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
-
-L(more16):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	bne	L(return)
-
-	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
-	   strings.  */
-	subs	limit, limit, 16
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
 	b.ls	L(last_bytes)
 
-	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
-	   try to align, so limit it only to strings larger than 128 bytes.  */
-	cmp	limit, 96
-	b.ls	L(loop16)
-
-	/* Align src1 and adjust src2 with bytes not yet done.  */
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-
-	/* Loop performing 16 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 16 and must be larger than zero.
-	   Exit if <= 16 bytes left to do or if the data is not equal.  */
-	.p2align 4
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
-	subs	limit, limit, 16
-	ccmp	data1, data2, 0, hi
-	ccmp	data1h, data2h, 0, eq
-	b.eq	L(loop16)
-
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
-	cmp	data1, data2
-	bne	L(return)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
 
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
-	cmp     data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
 	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
@@ -105,33 +86,105 @@
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	cmp     data1, data2
-L(ret_eq):
+	cmp	data1, data2
 	cset	result, ne
 	cneg	result, result, lo
 	ret
 
 	.p2align 4
-	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less16):
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
+
+	.p2align 4
 L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
-	ldr	data1w, [src1], 4
-	ldr	data2w, [src2], 4
+	tbz	limit, 2, L(less4)
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
+
+L(less4):
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
 	cmp	data1w, data2w
 	b.ne	L(return)
-	sub	limit, limit, 4
-L(less4):
-	adds	limit, limit, 4
-	beq	L(ret_eq)
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
 	sub	result, data1w, data2w
+L(return_zero):
+	ret
+
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	/* Align src2 and adjust src1, src2 and limit.  */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
+
+	/* If equal, process last 1-64 bytes using scalar loop.  */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+
+	/* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
 	ret
 
 END (__memcmp_aarch64)
-
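
The new return path can be modelled in C (a sketch, not the shipped code):
csel keeps whichever 8-byte pair differs, and on little-endian a byte
reverse makes the first differing byte the most significant, so a single
unsigned compare decides the sign of the result:

#include <stdint.h>

static int return_path (uint64_t data1, uint64_t data3,
			uint64_t data2, uint64_t data4)
{
	uint64_t a = (data1 != data2) ? data1 : data3;	/* csel */
	uint64_t b = (data1 != data2) ? data2 : data4;	/* csel */
	a = __builtin_bswap64 (a);	/* rev (little-endian only) */
	b = __builtin_bswap64 (b);	/* rev (little-endian only) */
	if (a == b)
		return 0;
	return a < b ? -1 : 1;		/* cset + cneg */
}
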
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index f97f2c3..e6527d0 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define src	x1
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
new file mode 100644
index 0000000..e8a946d
--- /dev/null
+++ b/string/aarch64/memcpy-sve.S
@@ -0,0 +1,177 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#if HAVE_SVE
+
+.arch armv8-a+sve
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1	x6
+#define vlen	x6
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small and simple, and to improve performance.
+   SVE vectors are used to speed up small copies.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The source pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cntb	vlen
+	cmp	count, vlen, lsl 1
+	b.hi	L(copy32_128)
+
+	whilelo p0.b, xzr, count
+	whilelo p1.b, vlen, count
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	add	srcend, src, count
+	add	dstend, dstin, count
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_q, F_q, [src, 32]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
+L(copy96):
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+L(loop64):
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
+	ret
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	cbz	tmp1, L(return)
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(return):
+	ret
+
+END (__memcpy_aarch64_sve)
+
+#endif
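
The overlap check in L(copy_long) reduces "would a forward copy clobber
source bytes that have not been read yet" to one unsigned compare. A small
C model of just that test (assumptions: flat address space and unsigned
wraparound, as on AArch64):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Model of "sub tmp1, dstin, src; cmp tmp1, count; b.lo backwards":
   with unsigned wraparound, (dst - src) < count holds exactly when dst
   lies inside [src, src + count), so the copy must run backwards.  */
static bool must_copy_backwards (uintptr_t dst, uintptr_t src, size_t count)
{
	return dst - src < count;
}
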
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index dd254f6..7c0606e 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define src	x1
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 7b4be84..6418bdf 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -1,8 +1,8 @@
 /*
  * memrchr - find last character in a memory zone.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -23,7 +23,6 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -31,19 +30,16 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take 4 bits from every comparison byte using a
+   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros identifies exactly which byte matched.  */
 
 ENTRY (__memrchr_aarch64)
 	PTR_ARG (0)
@@ -53,12 +49,9 @@
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -69,34 +62,36 @@
 	csel	result, result, xzr, hi
 	ret
 
+	nop
 L(start_loop):
-	sub	tmp, end, src
-	subs	cntrem, cntin, tmp
+	subs	cntrem, src, srcin
 	b.ls	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
+	sub	cntrem, cntrem, 1
+	tbz	cntrem, 4, L(loop32_2)
+	add	src, src, 16
 
-	.p2align 4
+	.p2align 5
 L(loop32):
-	ldr	qdata, [src, -16]!
+	ldr	qdata, [src, -32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 
 L(loop32_2):
-	ldr	qdata, [src, -16]!
+	ldr	qdata, [src, -16]
 	subs	cntrem, cntrem, 32
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	b.ls	L(end)
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
+L(end_2):
+	sub	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
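
memrchr searches from the end, so where memchr counts trailing zeros
(rbit + clz) it instead wants the highest set nibble of the syndrome,
which clz finds directly on little-endian. A scalar sketch of that
difference (illustrative only, using the GCC/Clang builtin):

#include <stdint.h>

/* Index of the last matching byte in a 16-byte chunk: the highest set
   bit of the syndrome, divided by 4; -1 if the chunk has no match.  */
static int last_match (const unsigned char *chunk, unsigned char c)
{
	uint64_t syndrome = 0;
	for (int i = 0; i < 16; i++)
		if (chunk[i] == c)
			syndrome |= 0xFull << (4 * i);
	if (syndrome == 0)
		return -1;
	return (63 - __builtin_clzll (syndrome)) / 4;
}
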
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 9fcd975..553b0fc 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,8 +1,8 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define val	x1
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c7119..0000000
--- a/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
index 82dd971..5d3f14b 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/stpcpy-sve.S
@@ -2,7 +2,7 @@
  * stpcpy - copy a string returning pointer to end.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STPCPY 1
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
index 4f62aa4..155c68d 100644
--- a/string/aarch64/stpcpy.S
+++ b/string/aarch64/stpcpy.S
@@ -2,7 +2,7 @@
  * stpcpy - copy a string returning pointer to end.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STPCPY 1
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index dcb0e46..6ec08f7 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -19,8 +19,7 @@
 
 #define src		x2
 #define tmp1		x1
-#define wtmp2		w3
-#define tmp3		x3
+#define tmp2		x3
 
 #define vrepchr		v0
 #define vdata		v1
@@ -28,39 +27,30 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
-#define vend		v6
-#define dend		d6
+#define vend		v5
+#define dend		d5
 
 /* Core algorithm.
 
    For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
-   requested character, bits 2-3 are set if the byte is NUL (or matched), and
-   bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
-   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
-   in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   per byte. Bits 0-1 are set if the relevant byte matched the requested
+   character, and bits 2-3 are set if the byte is NUL or matched. The count
+   of trailing zeros is a multiple of 4 exactly when the matching byte was
+   found first; dividing it by 4 then gives the byte position. If the count
+   is not a multiple of 4, the string ended first and there was no match.  */
 
 ENTRY (__strchr_aarch64_mte)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	wtmp2, 0x3003
-	dup	vrepmask.8h, wtmp2
+	movi	vrepmask.16b, 0x33
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp2, 0xf00f
-	dup	vrepmask2.8h, wtmp2
-
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	lsl	tmp3, srcin, 2
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-
+	lsl	tmp2, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
-	lsr	tmp1, tmp1, tmp3
+	lsr	tmp1, tmp1, tmp2
 	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
@@ -74,28 +64,34 @@
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
+	sub	src, src, 16
+L(end):
 
 #ifdef __AARCH64EB__
 	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	rbit	tmp1, tmp1
 #endif
+	add	src, src, 16
 	clz	tmp1, tmp1
-	/* Tmp1 is an even multiple of 2 if the target character was
-	   found first. Otherwise we've found the end of string.  */
+	/* Tmp1 is a multiple of 4 if the target character was found.  */
 	tst	tmp1, 2
 	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
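
A scalar sketch of the match/NUL encoding described in the algorithm
comment above (illustrative only; the routine merges the two NEON
comparisons with bit and a 0x33 mask, and it assumes the whole 16-byte
chunk is readable, as its aligned loads guarantee): low nibble bits flag
a character match, high nibble bits flag NUL-or-match, so the count of
trailing zeros is a multiple of 4 exactly when the character was found
before the terminator:

#include <stddef.h>
#include <stdint.h>

static const char *strchr_chunk (const char *s, char c)
{
	uint64_t synd = 0;
	for (int i = 0; i < 16; i++) {
		if (s[i] == c)
			synd |= 0x3ull << (4 * i);	/* bits 0-1: match */
		if (s[i] == 0 || s[i] == c)
			synd |= 0xCull << (4 * i);	/* bits 2-3: NUL or match */
	}
	if (synd == 0)
		return NULL;		/* neither found: scan next chunk */
	int pos = __builtin_ctzll (synd);
	/* pos & 2 set means the NUL came first: no match.  */
	return (pos & 2) ? NULL : s + pos / 4;
}
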
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 13ba9f4..ff07516 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,11 +1,11 @@
 /*
  * strchr/strchrnul - find a character in a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 1063cbf..37193bd 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,8 +1,8 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 1b0d0a6..543ee88 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -20,38 +20,32 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take 4 bits from every comparison byte using a
+   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchrnul_aarch64_mte)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -63,15 +57,22 @@
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
-
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	sub	src, src, 16
+L(end):
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
+	add	src, src, 16
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 428ff1a..0005f91 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
  * strchrnul - find a character or nul in a string
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define BUILD_STRCHRNUL
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index a4230d9..666e8d0 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b..0000000
--- a/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1		x0
-#define src2		x1
-#define result		x0
-
-#define data1		x2
-#define data1w		w2
-#define data2		x3
-#define data2w		w3
-#define has_nul		x4
-#define diff		x5
-#define off1		x5
-#define syndrome	x6
-#define tmp		x6
-#define data3		x7
-#define zeroones	x8
-#define shift		x9
-#define off2		x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
-   LS_FW means shifting towards early bytes.  */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-   can be done in parallel across the entire word.
-   Since carry propagation makes 0x1 bytes before a NUL byte appear
-   NUL too in big-endian, byte-reverse the data before the NUL check.  */
-
-
-ENTRY (__strcmp_aarch64_mte)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	sub	off2, src2, src1
-	mov	zeroones, REP8_01
-	and	tmp, src1, 7
-	tst	off2, 7
-	b.ne	L(misaligned8)
-	cbnz	tmp, L(mutual_align)
-
-	.p2align 4
-
-L(loop_aligned):
-	ldr	data2, [src1, off2]
-	ldr	data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
-	rev	tmp, data1
-	sub	has_nul, tmp, zeroones
-	orr	tmp, tmp, REP8_7f
-#else
-	sub	has_nul, data1, zeroones
-	orr	tmp, data1, REP8_7f
-#endif
-	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
-	ccmp	data1, data2, 0, eq
-	b.eq	L(loop_aligned)
-#ifdef __AARCH64EB__
-	rev	has_nul, has_nul
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
-	rev	syndrome, syndrome
-	rev	data1, data1
-	rev	data2, data2
-#endif
-	clz	shift, syndrome
-	/* The most-significant-non-zero bit of the syndrome marks either the
-	   first bit that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	lsl	data1, data1, shift
-	lsl	data2, data2, shift
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, 56
-	sub	result, data1, data2, lsr 56
-	ret
-
-	.p2align 4
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, 7
-	ldr	data2, [src1, off2]
-	ldr	data1, [src1], 8
-	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
-	mov	tmp, -1
-	LS_FW	tmp, tmp, shift
-	orr	data1, data1, tmp
-	orr	data2, data2, tmp
-	b	L(start_realigned)
-
-L(misaligned8):
-	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
-	   checking to make sure that we don't access beyond the end of SRC2.  */
-	cbz	tmp, L(src1_aligned)
-L(do_misaligned):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	cmp	data1w, 0
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.ne	L(done)
-	tst	src1, 7
-	b.ne	L(do_misaligned)
-
-L(src1_aligned):
-	neg	shift, src2, lsl 3
-	bic	src2, src2, 7
-	ldr	data3, [src2], 8
-#ifdef __AARCH64EB__
-	rev	data3, data3
-#endif
-	lsr	tmp, zeroones, shift
-	orr	data3, data3, tmp
-	sub	has_nul, data3, zeroones
-	orr	tmp, data3, REP8_7f
-	bics	has_nul, has_nul, tmp
-	b.ne	L(tail)
-
-	sub	off1, src2, src1
-
-	.p2align 4
-
-L(loop_unaligned):
-	ldr	data3, [src1, off1]
-	ldr	data2, [src1, off2]
-#ifdef __AARCH64EB__
-	rev	data3, data3
-#endif
-	sub	has_nul, data3, zeroones
-	orr	tmp, data3, REP8_7f
-	ldr	data1, [src1], 8
-	bics	has_nul, has_nul, tmp
-	ccmp	data1, data2, 0, eq
-	b.eq	L(loop_unaligned)
-
-	lsl	tmp, has_nul, shift
-#ifdef __AARCH64EB__
-	rev	tmp, tmp
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, tmp
-	cbnz	syndrome, L(end)
-L(tail):
-	ldr	data1, [src1]
-	neg	shift, shift
-	lsr	data2, data3, shift
-	lsr	has_nul, has_nul, shift
-#ifdef __AARCH64EB__
-	rev     data2, data2
-	rev	has_nul, has_nul
-#endif
-	eor	diff, data1, data2
-	orr	syndrome, diff, has_nul
-	b	L(end)
-
-L(done):
-	sub	result, data1, data2
-	ret
-
-END (__strcmp_aarch64_mte)
-
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index e6d2da5..eaf909a 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,11 +1,11 @@
 /*
  * __strcmp_aarch64_sve - compare two strings
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 7714ebf..137a9aa 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,168 +1,184 @@
 /*
  * strcmp - compare two strings
  *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
+
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
-/* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define result		x0
 
-/* Internal variables.  */
 #define data1		x2
 #define data1w		w2
 #define data2		x3
 #define data2w		w3
 #define has_nul		x4
 #define diff		x5
+#define off1		x5
 #define syndrome	x6
-#define tmp1		x7
-#define tmp2		x8
-#define tmp3		x9
-#define zeroones	x10
-#define pos		x11
+#define tmp		x6
+#define data3		x7
+#define zeroones	x8
+#define shift		x9
+#define off2		x10
 
-	/* Start of performance-critical section  -- one 64B cache line.  */
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.  */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+   can be done in parallel across the entire word.
+   Since carry propagation makes 0x1 bytes before a NUL byte appear
+   NUL too in big-endian, byte-reverse the data before the NUL check.  */
+
+
 ENTRY (__strcmp_aarch64)
 	PTR_ARG (0)
 	PTR_ARG (1)
-	eor	tmp1, src1, src2
-	mov	zeroones, #REP8_01
-	tst	tmp1, #7
+	sub	off2, src2, src1
+	mov	zeroones, REP8_01
+	and	tmp, src1, 7
+	tst	off2, 7
 	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	orr	syndrome, diff, has_nul
-	cbz	syndrome, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
+	cbnz	tmp, L(mutual_align)
 
+	.p2align 4
+
+L(loop_aligned):
+	ldr	data2, [src1, off2]
+	ldr	data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+	rev	tmp, data1
+	sub	has_nul, tmp, zeroones
+	orr	tmp, tmp, REP8_7f
+#else
+	sub	has_nul, data1, zeroones
+	orr	tmp, data1, REP8_7f
+#endif
+	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
+	ccmp	data1, data2, 0, eq
+	b.eq	L(loop_aligned)
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	eor	diff, data1, data2
+	orr	syndrome, diff, has_nul
 L(end):
-#ifndef	__AARCH64EB__
+#ifndef __AARCH64EB__
 	rev	syndrome, syndrome
 	rev	data1, data1
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, syndrome
 	rev	data2, data2
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-#else
-	/* For big-endian we cannot use the trick with the syndrome value
-	   as carry-propagation can corrupt the upper bits if the trailing
-	   bytes in the string contain 0x01.  */
-	/* However, if there is no NUL byte in the dword, we can generate
-	   the result directly.  We can't just subtract the bytes as the
-	   MSB might be significant.  */
-	cbnz	has_nul, 1f
-	cmp	data1, data2
-	cset	result, ne
-	cneg	result, result, lo
-	ret
-1:
-	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
-	rev	tmp3, data1
-	sub	tmp1, tmp3, zeroones
-	orr	tmp2, tmp3, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	rev	has_nul, has_nul
-	orr	syndrome, diff, has_nul
-	clz	pos, syndrome
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
+#endif
+	clz	shift, syndrome
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
 	   Shifting left now will bring the critical information into the
 	   top bits.  */
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
+	lsl	data1, data1, shift
+	lsl	data2, data2, shift
 	/* But we need to zero-extend (char is unsigned) the value and then
 	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
+	lsr	data1, data1, 56
+	sub	result, data1, data2, lsr 56
 	ret
-#endif
+
+	.p2align 4
 
 L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that preceed the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
+	   the bytes that precede the start point.  */
+	bic	src1, src1, 7
+	ldr	data2, [src1, off2]
+	ldr	data1, [src1], 8
+	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
+	mov	tmp, -1
+	LS_FW	tmp, tmp, shift
+	orr	data1, data1, tmp
+	orr	data2, data2, tmp
 	b	L(start_realigned)
 
 L(misaligned8):
 	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
-	   checking to make sure that we don't access beyond page boundary in
-	   SRC2.  */
-	tst	src1, #7
-	b.eq	L(loop_misaligned)
+	   checking to make sure that we don't access beyond the end of SRC2.  */
+	cbz	tmp, L(src1_aligned)
 L(do_misaligned):
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	cmp	data1w, 0
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
 	b.ne	L(done)
-	tst	src1, #7
+	tst	src1, 7
 	b.ne	L(do_misaligned)
 
-L(loop_misaligned):
-	/* Test if we are within the last dword of the end of a 4K page.  If
-	   yes then jump back to the misaligned loop to copy a byte at a time.  */
-	and	tmp1, src2, #0xff8
-	eor	tmp1, tmp1, #0xff8
-	cbz	tmp1, L(do_misaligned)
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
+L(src1_aligned):
+	neg	shift, src2, lsl 3
+	bic	src2, src2, 7
+	ldr	data3, [src2], 8
+#ifdef __AARCH64EB__
+	rev	data3, data3
+#endif
+	lsr	tmp, zeroones, shift
+	orr	data3, data3, tmp
+	sub	has_nul, data3, zeroones
+	orr	tmp, data3, REP8_7f
+	bics	has_nul, has_nul, tmp
+	b.ne	L(tail)
 
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	sub	off1, src2, src1
+
+	.p2align 4
+
+L(loop_unaligned):
+	ldr	data3, [src1, off1]
+	ldr	data2, [src1, off2]
+#ifdef __AARCH64EB__
+	rev	data3, data3
+#endif
+	sub	has_nul, data3, zeroones
+	orr	tmp, data3, REP8_7f
+	ldr	data1, [src1], 8
+	bics	has_nul, has_nul, tmp
+	ccmp	data1, data2, 0, eq
+	b.eq	L(loop_unaligned)
+
+	lsl	tmp, has_nul, shift
+#ifdef __AARCH64EB__
+	rev	tmp, tmp
+#endif
+	eor	diff, data1, data2
+	orr	syndrome, diff, tmp
+	cbnz	syndrome, L(end)
+L(tail):
+	ldr	data1, [src1]
+	neg	shift, shift
+	lsr	data2, data3, shift
+	lsr	has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+	rev     data2, data2
+	rev	has_nul, has_nul
+#endif
+	eor	diff, data1, data2
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, L(loop_misaligned)
 	b	L(end)
 
 L(done):
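
The NUL-detection identity quoted in the header comment can be spot-checked
in C; a minimal sketch using the file's REP8 constants (the existence test
is exact; only the position of the flagged byte can be off past the first
zero, which is why the big-endian path byte-reverses before the check):

#include <stdbool.h>
#include <stdint.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

/* Non-zero iff some byte of x is zero: the bytewise form of
   (X - 1) & ~(X | 0x7f), evaluated across all eight bytes at once.  */
static bool has_nul_byte (uint64_t x)
{
	return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
}
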
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d..0000000
--- a/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin		x0
-#define srcin		x1
-#define result		x0
-
-#define src		x2
-#define dst		x3
-#define len		x4
-#define synd		x4
-#define	tmp		x5
-#define wtmp		w5
-#define shift		x5
-#define data1		x6
-#define dataw1		w6
-#define data2		x7
-#define dataw2		w7
-
-#define dataq		q0
-#define vdata		v0
-#define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
-#define dataq2		q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
-
-ENTRY (STRCPY)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
-	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	lsr	synd, synd, shift
-	cbnz	synd, L(tail)
-
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(start_loop)
-
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	sub	tmp, src, srcin
-	clz	len, synd
-	add	len, tmp, len, lsr 2
-	tbz	len, 4, L(less16)
-	sub	tmp, len, 15
-	ldr	dataq, [srcin]
-	ldr	dataq2, [srcin, tmp]
-	str	dataq, [dstin]
-	str	dataq2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4,,8
-L(tail):
-	rbit	synd, synd
-	clz	len, synd
-	lsr	len, len, 2
-
-	.p2align 4
-L(less16):
-	tbz	len, 3, L(less8)
-	sub	tmp, len, 7
-	ldr	data1, [srcin]
-	ldr	data2, [srcin, tmp]
-	str	data1, [dstin]
-	str	data2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(less8):
-	subs	tmp, len, 3
-	b.lo	L(less4)
-	ldr	dataw1, [srcin]
-	ldr	dataw2, [srcin, tmp]
-	str	dataw1, [dstin]
-	str	dataw2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-L(less4):
-	cbz	len, L(zerobyte)
-	ldrh	dataw1, [srcin]
-	strh	dataw1, [dstin]
-L(zerobyte):
-	strb	wzr, [dstin, len]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(start_loop):
-	sub	len, src, srcin
-	ldr	dataq2, [srcin]
-	add	dst, dstin, len
-	str	dataq2, [dstin]
-
-	.p2align 5
-L(loop):
-	str	dataq, [dst], 16
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(loop)
-
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-	fmov	synd, dend
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	clz	len, synd
-	lsr	len, len, 2
-	sub	tmp, len, 15
-	ldr	dataq, [src, tmp]
-	str	dataq, [dst, tmp]
-	IFSTPCPY (add result, dst, len)
-	ret
-
-END (STRCPY)
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index f515462..00e72dc 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,11 +1,11 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 6e9ed42..97ae37e 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,311 +1,156 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
-
-   To test the page crossing code path more thoroughly, compile with
-   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
 #define dstin		x0
 #define srcin		x1
+#define result		x0
 
-/* Locals and temporaries.  */
 #define src		x2
 #define dst		x3
-#define data1		x4
-#define data1w		w4
-#define data2		x5
-#define data2w		w5
-#define has_nul1	x6
-#define has_nul2	x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define tmp4		x11
-#define zeroones	x12
-#define data1a		x13
-#define data2a		x14
-#define pos		x15
-#define len		x16
-#define to_align	x17
+#define len		x4
+#define synd		x4
+#define	tmp		x5
+#define shift		x5
+#define data1		x6
+#define dataw1		w6
+#define data2		x7
+#define dataw2		w7
+
+#define dataq		q0
+#define vdata		v0
+#define vhas_nul	v1
+#define vend		v2
+#define dend		d2
+#define dataq2		q1
 
 #ifdef BUILD_STPCPY
-#define STRCPY __stpcpy_aarch64
+# define STRCPY __stpcpy_aarch64
+# define IFSTPCPY(X,...) X,__VA_ARGS__
 #else
-#define STRCPY __strcpy_aarch64
+# define STRCPY __strcpy_aarch64
+# define IFSTPCPY(X,...)
 #endif
 
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
-	   page size check for crossing this boundary on entry and if we
-	   do not, then we can short-circuit much of the entry code.  We
-	   expect early page-crossing strings to be rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
-	   predictable, even with random strings.
-
-	   We don't bother checking for larger page sizes, the cost of setting
-	   up the correct page size is just not worth the extra gain from
-	   a small reduction in the cases taking the slow path.  Note that
-	   we only care about whether the first fetch, which may be
-	   misaligned, crosses a page boundary - after that we move to aligned
-	   fetches for the remainder of the string.  */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
-	/* Make everything that isn't Qword aligned look like a page cross.  */
-#define MIN_PAGE_P2 4
-#else
-#define MIN_PAGE_P2 12
-#endif
-
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take 4 bits from every comparison byte using a
+   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros identifies exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
-	/* For moderately short strings, the fastest way to do the copy is to
-	   calculate the length of the string in the same way as strlen, then
-	   essentially do a memcpy of the result.  This avoids the need for
-	   multiple byte copies and further means that by the time we
-	   reach the bulk copy loop we know we can always use DWord
-	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
-	   with the same source string, so branch prediction is likely to
-	   always be difficult - we mitigate against this by preferring
-	   conditional select operations over branches whenever this is
-	   feasible.  */
-	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
-	mov	zeroones, #REP8_01
-	and	to_align, srcin, #15
-	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
-	neg	tmp1, to_align
-	/* The first fetch will straddle a (possible) page boundary iff
-	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
-	   aligned string will never fail the page align check, so will
-	   always take the fast path.  */
-	b.gt	L(page_cross)
+	bic	src, srcin, 15
+	ld1	{vdata.16b}, [src]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbnz	synd, L(tail)
 
-L(page_cross_ok):
-	ldp	data1, data2, [srcin]
-#ifdef __AARCH64EB__
-	/* Because we expect the end to be found within 16 characters
-	   (profiling shows this is the most common case), it's worth
-	   swapping the bytes now to save having to recalculate the
-	   termination syndrome later.  We preserve data1 and data2
-	   so that we can re-use the values later on.  */
-	rev	tmp2, data1
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	L(fp_le8)
-	rev	tmp4, data2
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	L(fp_le8)
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-#endif
-	bics	has_nul2, tmp3, tmp4
-	b.eq	L(bulk_entry)
+	ldr	dataq, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	cbz	synd, L(start_loop)
 
-	/* The string is short (<=16 bytes).  We don't know exactly how
-	   short though, yet.  Work out the exact length so that we can
-	   quickly select the optimal copy strategy.  */
-L(fp_gt8):
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	mov	tmp2, #56
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	sub	pos, tmp2, pos
-#ifdef __AARCH64EB__
-	lsr	data2, data2, pos
-#else
-	lsl	data2, data2, pos
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	str	data2, [dst, #1]
+	sub	tmp, src, srcin
+	clz	len, synd
+	add	len, tmp, len, lsr 2
+	tbz	len, 4, L(less16)
+	sub	tmp, len, 15
+	ldr	dataq, [srcin]
+	ldr	dataq2, [srcin, tmp]
+	str	dataq, [dstin]
+	str	dataq2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
+
+L(tail):
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 2
+L(less16):
+	tbz	len, 3, L(less8)
+	sub	tmp, len, 7
+	ldr	data1, [srcin]
+	ldr	data2, [srcin, tmp]
 	str	data1, [dstin]
-#ifdef BUILD_STPCPY
-	add	dstin, dst, #8
-#endif
+	str	data2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-L(fp_le8):
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	subs	tmp2, pos, #24			/* Pos in bits. */
-	b.lt	L(fp_lt4)
-#ifdef __AARCH64EB__
-	mov	tmp2, #56
-	sub	pos, tmp2, pos
-	lsr	data2, data1, pos
-	lsr	data1, data1, #32
-#else
-	lsr	data2, data1, tmp2
-#endif
-	/* 4->7 bytes to copy.  */
-	str	data2w, [dst, #-3]
-	str	data1w, [dstin]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
-	ret
-L(fp_lt4):
-	cbz	pos, L(fp_lt2)
-	/* 2->3 bytes to copy.  */
-#ifdef __AARCH64EB__
-	lsr	data1, data1, #48
-#endif
-	strh	data1w, [dstin]
-	/* Fall-through, one byte (max) to go.  */
-L(fp_lt2):
-	/* Null-terminated string.  Last character must be zero!  */
-	strb	wzr, [dst]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
+	.p2align 4
+L(less8):
+	subs	tmp, len, 3
+	b.lo	L(less4)
+	ldr	dataw1, [srcin]
+	ldr	dataw2, [srcin, tmp]
+	str	dataw1, [dstin]
+	str	dataw2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-	.p2align 6
-	/* Aligning here ensures that the entry code and main loop all lies
-	   within one 64-byte cache line.  */
-L(bulk_entry):
-	sub	to_align, to_align, #16
-	stp	data1, data2, [dstin]
-	sub	src, srcin, to_align
-	sub	dst, dstin, to_align
-	b	L(entry_no_page_cross)
-
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-L(main_loop):
-	stp	data1, data2, [dst], #16
-L(entry_no_page_cross):
-	ldp	data1, data2, [src], #16
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(main_loop)
-
-	/* Since we know we are copying at least 16 bytes, the fastest way
-	   to deal with the tail is to determine the location of the
-	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-	cmp	has_nul1, #0
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, ne
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, ne
-#endif
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	tmp1, pos, #72
-	add	pos, pos, #8
-	csel	pos, pos, tmp1, ne
-	add	src, src, pos, lsr #3
-	add	dst, dst, pos, lsr #3
-	ldp	data1, data2, [src, #-32]
-	stp	data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
-	sub	dstin, dst, #1
-#endif
+L(less4):
+	cbz	len, L(zerobyte)
+	ldrh	dataw1, [srcin]
+	strh	dataw1, [dstin]
+L(zerobyte):
+	strb	wzr, [dstin, len]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-L(page_cross):
-	bic	src, srcin, #15
-	/* Start by loading two words at [srcin & ~15], then forcing the
-	   bytes that precede srcin to 0xff.  This means they never look
-	   like termination bytes.  */
-	ldp	data1, data2, [src]
-	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
-	tst	to_align, #7
-	csetm	tmp2, ne
-#ifdef __AARCH64EB__
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	.p2align 4
+L(start_loop):
+	sub	tmp, srcin, dstin
+	ldr	dataq2, [srcin]
+	sub	dst, src, tmp
+	str	dataq2, [dstin]
+L(loop):
+	str	dataq, [dst], 32
+	ldr	dataq, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loopend)
+	str	dataq, [dst, -16]
+	ldr	dataq, [src, 32]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	dst, dst, 16
+L(loopend):
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	sub	dst, dst, 31
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	cmp	to_align, #8
-	csinv	data1, data1, xzr, lt
-	csel	data2, data2, data2a, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(page_cross_ok)
-	/* We now need to make data1 and data2 look like they've been
-	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
-	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
-	neg	tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
-	lsl	data1a, data1, tmp1
-	lsr	tmp4, data2, tmp2
-	lsl	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	rev	tmp2, data1
-	rev	tmp4, data2
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	lsr	data1a, data1, tmp1
-	lsl	tmp4, data2, tmp2
-	lsr	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-#endif
-	bic	has_nul1, tmp1, tmp2
-	cbnz	has_nul1, L(fp_le8)
-	bic	has_nul2, tmp3, tmp4
-	b	L(fp_gt8)
+	clz	len, synd
+	lsr	len, len, 2
+	add	dst, dst, len
+	ldr	dataq, [dst, tmp]
+	str	dataq, [dst]
+	IFSTPCPY (add result, dst, 15)
+	ret
 
 END (STRCPY)
-
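
A note on the tail strategy above: once len (the index of the terminating NUL)
is known, the new code copies one block from the start of the string and a
second block ending on the NUL; the two blocks may overlap, which is harmless.
A minimal C sketch of the 8-to-15-byte case (illustrative only; copy_tail_8_15
is a hypothetical helper, not part of the library):

    #include <stdint.h>
    #include <string.h>

    /* Copy a string whose NUL sits at index len, 7 < len < 16, using two
       overlapping 8-byte copies, mirroring the 8-byte path under L(less16).  */
    static void copy_tail_8_15(char *dst, const char *src, size_t len)
    {
        uint64_t a, b;
        memcpy(&a, src, 8);              /* first 8 bytes */
        memcpy(&b, src + len - 7, 8);    /* last 8 bytes, ending on the NUL */
        memcpy(dst, &a, 8);
        memcpy(dst + len - 7, &b, 8);
    }
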
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 7cf41d5..7723579 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define result		x0
@@ -19,35 +19,26 @@
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. Counting trailing zeros
+   then identifies the first zero byte.  */
 
 ENTRY (__strlen_aarch64_mte)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -59,19 +50,25 @@
 
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]!
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop_end)
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop)
-
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	sub	src, src, 16
+L(loop_end):
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
+	add	result, result, 16
 	clz	tmp, synd
 	add	result, result, tmp, lsr 2
 	ret
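
The core-algorithm comment above is the key to all of the shrn-based routines
in this drop: cmeq turns each byte into 0x00 or 0xff, and shrn by 4 narrows
that to one nibble per byte, so a 16-byte chunk collapses into a 64-bit
syndrome in which counting trailing zeros (after rbit on little-endian)
locates the first NUL. A scalar C sketch of the same idea (illustrative;
first_nul_in_chunk is a hypothetical helper, and __builtin_ctzll assumes
GCC/Clang):

    #include <stddef.h>
    #include <stdint.h>

    /* Scan one 16-byte chunk; return the index of the first NUL, or 16.  */
    static size_t first_nul_in_chunk(const unsigned char *p)
    {
        uint64_t syndrome = 0;
        for (int i = 0; i < 16; i++)
            if (p[i] == 0)
                syndrome |= 0xfULL << (4 * i);   /* one nibble per byte */
        if (syndrome == 0)
            return 16;
        return __builtin_ctzll(syndrome) / 4;    /* first set nibble */
    }
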
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 2392493..12ebbdb 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,11 +1,11 @@
 /*
  * __strlen_aarch64_sve - compute the length of a string
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index a1b164a..6f6f08f 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Not MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin	x0
 #define len	x0
@@ -36,6 +36,7 @@
 #define tmp	x2
 #define tmpw	w2
 #define synd	x3
+#define syndw	w3
 #define shift	x4
 
 /* For the first 32 bytes, NUL detection works on the principle that
@@ -110,7 +111,6 @@
 	add	len, len, tmp1, lsr 3
 	ret
 
-	.p2align 3
 	/* Look for a NUL byte at offset 16..31 in the string.  */
 L(bytes16_31):
 	ldp	data1, data2, [srcin, 16]
@@ -138,6 +138,7 @@
 	add	len, len, tmp1, lsr 3
 	ret
 
+	nop
 L(loop_entry):
 	bic	src, srcin, 31
 
@@ -153,18 +154,12 @@
 	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
 	cmeq	maskv.16b, datav1.16b, 0
 	sub	len, src, srcin
-	tst	synd, 0xffffffff
-	b.ne	1f
+	cbnz	syndw, 1f
 	cmeq	maskv.16b, datav2.16b, 0
 	add	len, len, 16
 1:
 	/* Generate a bitmask and compute correct byte offset.  */
-#ifdef __AARCH64EB__
-	bic	maskv.8h, 0xf0
-#else
-	bic	maskv.8h, 0x0f, lsl 8
-#endif
-	umaxp	maskv.16b, maskv.16b, maskv.16b
+	shrn	maskv.8b, maskv.8h, 4
 	fmov	synd, maskd
 #ifndef __AARCH64EB__
 	rbit	synd, synd
@@ -173,8 +168,6 @@
 	add	len, len, tmp, lsr 2
 	ret
 
-        .p2align 4
-
 L(page_cross):
 	bic	src, srcin, 31
 	mov	tmpw, 0x0c03
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8..0000000
--- a/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		x0
-
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define syndrome	x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define zeroones	x11
-#define pos		x12
-#define mask		x13
-#define endloop		x14
-#define count		mask
-#define offset		pos
-#define neg_offset	x15
-
-/* Define endian dependent shift operations.
-   On big-endian early bytes are at MSB and on little-endian LSB.
-   LS_FW means shifting towards early bytes.
-   LS_BK means shifting towards later bytes.
-   */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	mov	zeroones, #REP8_01
-	tst	tmp1, #7
-	and	count, src1, #7
-	b.ne	L(misaligned8)
-	cbnz	count, L(mutual_align)
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	.p2align 4
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit, limit, #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	endloop, #0, #0, eq
-	b.eq	L(loop_aligned)
-	/* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
-	orr	syndrome, diff, has_nul
-	add	limit, limit, 8	/* Rewind limit to before last subs. */
-L(syndrome_check):
-	/* Limit was reached. Check if the NUL byte or the difference
-	   is before the limit. */
-	rev	syndrome, syndrome
-	rev	data1, data1
-	clz	pos, syndrome
-	rev	data2, data2
-	lsl	data1, data1, pos
-	cmp	limit, pos, lsr #3
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	csel result, result, xzr, hi
-	ret
-#else
-	/* Not reached the limit, must have found the end or a diff.  */
-	tbz	limit, #63, L(not_limit)
-	add	tmp1, limit, 8
-	cbz	limit, L(not_limit)
-
-	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-	lsr	mask, mask, limit
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	/* Make sure that the NUL byte is marked in the syndrome.  */
-	orr	has_nul, has_nul, mask
-
-L(not_limit):
-	/* For big-endian we cannot use the trick with the syndrome value
-	   as carry-propagation can corrupt the upper bits if the trailing
-	   bytes in the string contain 0x01.  */
-	/* However, if there is no NUL byte in the dword, we can generate
-	   the result directly.  We can't just subtract the bytes as the
-	   MSB might be significant.  */
-	cbnz	has_nul, 1f
-	cmp	data1, data2
-	cset	result, ne
-	cneg	result, result, lo
-	ret
-1:
-	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
-	rev	tmp3, data1
-	sub	tmp1, tmp3, zeroones
-	orr	tmp2, tmp3, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	rev	has_nul, has_nul
-	orr	syndrome, diff, has_nul
-	clz	pos, syndrome
-	/* The most-significant-non-zero bit of the syndrome marks either the
-	   first bit that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-L(end_quick):
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-#endif
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.
-	   We also need to adjust the limit calculations, but without
-	   overflowing if the limit is near ULONG_MAX.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	ldr	data1, [src1], #8
-	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-	/* Adjust the limit and ensure it doesn't overflow.  */
-	adds	limit, limit, count
-	csinv	limit, limit, xzr, lo
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	b	L(start_realigned)
-
-	.p2align 4
-	/* Don't bother with dwords for up to 16 bytes.  */
-L(misaligned8):
-	cmp	limit, #16
-	b.hs	L(try_misaligned_words)
-
-L(byte_loop):
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
-L(done):
-	sub	result, data1, data2
-	ret
-	/* Align the SRC1 to a dword by doing a bytewise compare and then do
-	   the dword loop.  */
-L(try_misaligned_words):
-	cbz	count, L(src1_aligned)
-
-	neg	count, count
-	and	count, count, #7
-	sub	limit, limit, count
-
-L(page_end_loop):
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.ne	L(done)
-	subs	count, count, #1
-	b.hi	L(page_end_loop)
-
-	/* The following diagram explains the comparison of misaligned strings.
-	   The bytes are shown in natural order. For little-endian, it is
-	   reversed in the registers. The "x" bytes are before the string.
-	   The "|" separates data that is loaded at one time.
-	   src1     | a a a a a a a a | b b b c c c c c | . . .
-	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
-
-	   After shifting in each step, the data looks like this:
-	                STEP_A              STEP_B              STEP_C
-	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
-	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
-
-	   The bytes with "0" are eliminated from the syndrome via mask.
-
-	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
-	   time from SRC2. The comparison happens in 3 steps. After each step
-	   the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
-	/* Calculate offset from 8 byte alignment to string start in bits. No
-	   need to mask offset since shifts are ignoring upper bits. */
-	lsl	offset, src2, #3
-	bic	src2, src2, #0xf
-	mov	mask, -1
-	neg	neg_offset, offset
-	ldr	data1, [src1], #8
-	ldp	tmp1, tmp2, [src2], #16
-	LS_BK	mask, mask, neg_offset
-	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
-	/* Skip the first compare if data in tmp1 is irrelevant. */
-	tbnz	offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
-	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
-	LS_FW	data2, tmp1, offset
-	LS_BK	tmp1, tmp2, neg_offset
-	subs	limit, limit, #8
-	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
-	sub	has_nul, data1, zeroones
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	orr	tmp3, data1, #REP8_7f
-	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
-	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
-	orr	tmp3, endloop, has_nul
-	cbnz	tmp3, L(full_check)
-
-	ldr	data1, [src1], #8
-L(misaligned_mid_loop):
-	/* STEP_B: Compare first part of data1 to second part of tmp2. */
-	LS_FW	data2, tmp2, offset
-#ifdef __AARCH64EB__
-	/* For big-endian we do a byte reverse to avoid carry-propagation
-	problem described above. This way we can reuse the has_nul in the
-	next step and also use syndrome value trick at the end. */
-	rev	tmp3, data1
-	#define data1_fixed tmp3
-#else
-	#define data1_fixed data1
-#endif
-	sub	has_nul, data1_fixed, zeroones
-	orr	tmp3, data1_fixed, #REP8_7f
-	eor	diff, data2, data1	/* Non-zero if differences found.  */
-	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
-#ifdef __AARCH64EB__
-	rev	has_nul, has_nul
-#endif
-	cmp	limit, neg_offset, lsr #3
-	orr	syndrome, diff, has_nul
-	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
-	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
-	cbnz	tmp3, L(syndrome_check)
-
-	/* STEP_C: Compare second part of data1 to first part of tmp1. */
-	ldp	tmp1, tmp2, [src2], #16
-	cmp	limit, #8
-	LS_BK	data2, tmp1, neg_offset
-	eor	diff, data2, data1	/* Non-zero if differences found.  */
-	orr	syndrome, diff, has_nul
-	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
-	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
-	cbnz	tmp3, L(syndrome_check)
-
-	ldr	data1, [src1], #8
-	sub	limit, limit, #8
-	b	L(loop_misaligned)
-
-#ifdef	__AARCH64EB__
-L(syndrome_check):
-	clz	pos, syndrome
-	cmp	pos, limit, lsl #3
-	b.lo	L(end_quick)
-#endif
-
-L(ret0):
-	mov	result, #0
-	ret
-END(__strncmp_aarch64_mte)
-
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 234190e..6a9e9f7 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,11 +1,11 @@
 /*
  * strncmp - compare two strings with limit
  *
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 738b653..128a10c 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
 /* Parameters and result.  */
 #define src1		x0
@@ -35,10 +35,24 @@
 #define tmp3		x10
 #define zeroones	x11
 #define pos		x12
-#define limit_wd	x13
-#define mask		x14
-#define endloop		x15
+#define mask		x13
+#define endloop		x14
 #define count		mask
+#define offset		pos
+#define neg_offset	x15
+
+/* Define endian-dependent shift operations.
+   On big-endian, early bytes are at the MSB; on little-endian, at the LSB.
+   LS_FW means shifting towards early bytes.
+   LS_BK means shifting towards later bytes.
+   */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
 
 ENTRY (__strncmp_aarch64)
 	PTR_ARG (0)
@@ -51,9 +65,6 @@
 	and	count, src1, #7
 	b.ne	L(misaligned8)
 	cbnz	count, L(mutual_align)
-	/* Calculate the number of full and partial words -1.  */
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
 
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
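
For reference, the same principle in C (illustrative; has_zero_byte is a
hypothetical helper): subtracting 0x01 from every byte borrows out of a zero
byte, and masking with ~(X | 0x7f..7f) keeps only the top bit of bytes that
were below 0x80, so a whole word is tested at once.

    #include <stdint.h>

    /* Non-zero iff some byte of x is zero.  */
    static int has_zero_byte(uint64_t x)
    {
        return ((x - 0x0101010101010101ULL)
                & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;
    }
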
@@ -63,30 +74,45 @@
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
 L(start_realigned):
-	subs	limit_wd, limit_wd, #1
+	subs	limit, limit, #8
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
+	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
 	/* End of main loop */
 
-	/* Not reached the limit, must have found the end or a diff.  */
-	tbz	limit_wd, #63, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
+L(full_check):
+#ifndef __AARCH64EB__
+	orr	syndrome, diff, has_nul
+	add	limit, limit, 8	/* Rewind limit to before last subs. */
+L(syndrome_check):
+	/* Limit was reached. Check if the NUL byte or the difference
+	   is before the limit. */
+	rev	syndrome, syndrome
+	rev	data1, data1
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	cmp	limit, pos, lsr #3
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	csel result, result, xzr, hi
+	ret
 #else
-	lsl	mask, mask, limit
-#endif
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit, #63, L(not_limit)
+	add	tmp1, limit, 8
+	cbz	limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
+	mov	mask, #~0
+	lsr	mask, mask, limit
 	bic	data1, data1, mask
 	bic	data2, data2, mask
 
@@ -94,25 +120,6 @@
 	orr	has_nul, has_nul, mask
 
 L(not_limit):
-	orr	syndrome, diff, has_nul
-
-#ifndef	__AARCH64EB__
-	rev	syndrome, syndrome
-	rev	data1, data1
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, syndrome
-	rev	data2, data2
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-#else
 	/* For big-endian we cannot use the trick with the syndrome value
 	   as carry-propagation can corrupt the upper bits if the trailing
 	   bytes in the string contain 0x01.  */
@@ -133,10 +140,11 @@
 	rev	has_nul, has_nul
 	orr	syndrome, diff, has_nul
 	clz	pos, syndrome
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
 	   Shifting left now will bring the critical information into the
 	   top bits.  */
+L(end_quick):
 	lsl	data1, data1, pos
 	lsl	data2, data2, pos
 	/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +166,12 @@
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#endif
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
-	add	tmp3, tmp3, count
+	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
 	.p2align 4
@@ -196,13 +194,11 @@
 	/* Align the SRC1 to a dword by doing a bytewise compare and then do
 	   the dword loop.  */
 L(try_misaligned_words):
-	lsr	limit_wd, limit, #3
-	cbz	count, L(do_misaligned)
+	cbz	count, L(src1_aligned)
 
 	neg	count, count
 	and	count, count, #7
 	sub	limit, limit, count
-	lsr	limit_wd, limit, #3
 
 L(page_end_loop):
 	ldrb	data1w, [src1], #1
@@ -213,48 +209,100 @@
 	subs	count, count, #1
 	b.hi	L(page_end_loop)
 
-L(do_misaligned):
-	/* Prepare ourselves for the next page crossing.  Unlike the aligned
-	   loop, we fetch 1 less dword because we risk crossing bounds on
-	   SRC2.  */
-	mov	count, #8
-	subs	limit_wd, limit_wd, #1
-	b.lo	L(done_loop)
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1     | a a a a a a a a | b b b c c c c c | . . .
+	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
+
+	   After shifting in each step, the data looks like this:
+	                STEP_A              STEP_B              STEP_C
+	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
+	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
+
+	   The bytes with "0" are eliminated from the syndrome via mask.
+
+	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate the offset from 8-byte alignment to the string start in
+	   bits. No need to mask the offset since shifts ignore upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
+	ldr	data1, [src1], #8
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
+	/* Skip the first compare if data in tmp1 is irrelevant. */
+	tbnz	offset, 6, L(misaligned_mid_loop)
+
 L(loop_misaligned):
-	and	tmp2, src2, #0xff8
-	eor	tmp2, tmp2, #0xff8
-	cbz	tmp2, L(page_end_loop)
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	subs	limit, limit, #8
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	sub	has_nul, data1, zeroones
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	orr	tmp3, data1, #REP8_7f
+	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
+	orr	tmp3, endloop, has_nul
+	cbnz	tmp3, L(full_check)
 
 	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
-	subs	limit_wd, limit_wd, #1
-	b.pl	L(loop_misaligned)
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2. */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid the carry-propagation
+	   problem described above. This way we can reuse has_nul in the next
+	   step and also use the syndrome value trick at the end. */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	cmp	limit, neg_offset, lsr #3
+	orr	syndrome, diff, has_nul
+	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
 
-L(done_loop):
-	/* We found a difference or a NULL before the limit was reached.  */
-	and	limit, limit, #7
-	cbz	limit, L(not_limit)
-	/* Read the last word.  */
-	sub	src1, src1, 8
-	sub	src2, src2, 8
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
+	/* STEP_C: Compare second part of data1 to first part of tmp1. */
+	ldp	tmp1, tmp2, [src2], #16
+	cmp	limit, #8
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	ldr	data1, [src1], #8
+	sub	limit, limit, #8
+	b	L(loop_misaligned)
+
+#ifdef	__AARCH64EB__
+L(syndrome_check):
+	clz	pos, syndrome
+	cmp	pos, limit, lsl #3
+	b.lo	L(end_quick)
+#endif
 
 L(ret0):
 	mov	result, #0
 	ret
-
-END ( __strncmp_aarch64)
+END(__strncmp_aarch64)
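
The STEP_A/B/C scheme documented above never loads across SRC2's next 16-byte
boundary; instead each 8-byte window of SRC2 is assembled from two aligned
loads with a pair of shifts (LS_FW/LS_BK). A little-endian C sketch, assuming
0 < offset_bits < 64 (merge_window is a hypothetical helper; the aligned case
takes a different path above):

    #include <stdint.h>

    /* Build the 8 bytes starting offset_bits/8 past 'lo' from two aligned
       8-byte loads (little-endian).  */
    static uint64_t merge_window(uint64_t lo, uint64_t hi,
                                 unsigned offset_bits)
    {
        return (lo >> offset_bits) | (hi << (64 - offset_bits));
    }
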
 
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 5b9ebf7..6c43dc4 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,11 +1,11 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 48d2495..f2090a7 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,8 +1,8 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define cntin		x1
@@ -20,39 +20,30 @@
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. Counting trailing zeros
+   then identifies the first zero byte.  */
 
 ENTRY (__strnlen_aarch64)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
-	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -64,37 +55,40 @@
 	csel	result, cntin, result, ls
 	ret
 
+L(nomatch):
+	mov	result, cntin
+	ret
+
 L(start_loop):
 	sub	tmp, src, srcin
+	add	tmp, tmp, 17
 	subs	cntrem, cntin, tmp
-	b.ls	L(nomatch)
+	b.lo	L(nomatch)
 
 	/* Make sure that it won't overread by a 16-byte chunk */
-	add	tmp, cntrem, 15
-	tbnz	tmp, 4, L(loop32_2)
-
+	tbz	cntrem, 4, L(loop32_2)
+	sub	src, src, 16
 	.p2align 5
 L(loop32):
-	ldr	qdata, [src], 16
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbnz	synd, L(end)
 L(loop32_2):
-	ldr	qdata, [src], 16
+	ldr	qdata, [src, 16]
 	subs	cntrem, cntrem, 32
 	cmeq	vhas_chr.16b, vdata.16b, 0
-	b.ls	L(end)
+	b.lo	L(end_2)
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop32)
-
+L(end_2):
+	add	src, src, 16
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
-	sub	src, src, 16
-	mov	synd, vend.d[0]
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	result, src, srcin
+	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
@@ -104,9 +98,5 @@
 	csel	result, cntin, result, ls
 	ret
 
-L(nomatch):
-	mov	result, cntin
-	ret
-
 END (__strnlen_aarch64)
 
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index 1e4fb1a..bb61ab9 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -1,8 +1,8 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define srcin		x0
 #define chrin		w1
@@ -19,7 +19,6 @@
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -31,7 +30,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -47,55 +45,67 @@
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+	rbit	synd, synd
+#else
+	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne
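
The movi 0x33 mask above lets one syndrome carry both match kinds: after the
bit insert and the shrn narrowing, each source byte owns one nibble in which
bits 0-1 flag a character match and bits 2-3 flag a NUL, which is why the code
masks with 0x3333... and 0xcccc.... A scalar C sketch of the packing
(illustrative; pack_syndrome is a hypothetical helper):

    #include <stdint.h>

    /* Nibble i of the result is 0x3 if p[i] == c, 0xc if p[i] == 0,
       0xf if both.  */
    static uint64_t pack_syndrome(const unsigned char *p, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++) {
            if (p[i] == c) synd |= 0x3ULL << (4 * i);
            if (p[i] == 0) synd |= 0xcULL << (4 * i);
        }
        return synd;  /* NUL bits: 0xccc...c; char bits: 0x333...3 */
    }
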
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index d36d69a..825a738 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,11 +1,11 @@
 /*
  * strrchr - find the last of a character in a string
  *
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #if __ARM_FEATURE_SVE
 /* Assumptions:
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 56185ff..bf9cb29 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -1,8 +1,8 @@
 /*
  * strrchr - find last position of a character in a string.
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  * Neon Available.
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Arguments and results.  */
 #define srcin		x0
diff --git a/string/arm/asmdefs.h b/string/arm/asmdefs.h
new file mode 100644
index 0000000..e311888
--- /dev/null
+++ b/string/arm/asmdefs.h
@@ -0,0 +1,477 @@
+/*
+ * Macros for asm code.  Arm version.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Check whether leaf function PAC signing has been requested in the
+   -mbranch-protection compile-time option.  */
+#define LEAF_PROTECT_BIT 2
+
+#ifdef __ARM_FEATURE_PAC_DEFAULT
+# define HAVE_PAC_LEAF \
+	((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1)
+#else
+# define HAVE_PAC_LEAF 0
+#endif
+
+/* Provide default parameters for PAC-code handling in leaf-functions.  */
+#if HAVE_PAC_LEAF
+# ifndef PAC_LEAF_PUSH_IP
+#  define PAC_LEAF_PUSH_IP 1
+# endif
+#else /* !HAVE_PAC_LEAF */
+# undef PAC_LEAF_PUSH_IP
+# define PAC_LEAF_PUSH_IP 0
+#endif /* HAVE_PAC_LEAF */
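
In other words, HAVE_PAC_LEAF tests bit 2 (LEAF_PROTECT_BIT) of
__ARM_FEATURE_PAC_DEFAULT, which the compiler defines when
-mbranch-protection=pac-ret+leaf is in effect. The equivalent check in plain C
(illustrative; pac_leaf_requested is a hypothetical helper):

    /* Per ACLE, bit 2 of __ARM_FEATURE_PAC_DEFAULT requests PAC signing
       of leaf functions as well.  */
    static int pac_leaf_requested(unsigned pac_default)
    {
        return (pac_default & (1u << 2)) != 0;
    }
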
+
+#define STACK_ALIGN_ENFORCE 0
+
+/******************************************************************************
+* Implementation of the prologue and epilogue assembler macros and their
+* associated helper functions.
+*
+* These functions add support for the following:
+*
+* - M-profile branch target identification (BTI) landing-pads when compiled
+*   with `-mbranch-protection=bti'.
+* - PAC-signing and verification instructions, depending on hardware support
+*   and whether the PAC-signing of leaf functions has been requested via the
+*   `-mbranch-protection=pac-ret+leaf' compiler argument.
+* - 8-byte stack alignment preservation at function entry, defaulting to the
+*   value of STACK_ALIGN_ENFORCE.
+*
+* Notes:
+* - Prologue stack alignment is implemented by detecting a push with an odd
+*   number of registers and prepending a dummy register to the list.
+* - If alignment is attempted on a list containing r0, compilation will result
+*   in an error.
+* - If alignment is attempted in a list containing r1, r0 will be prepended to
+*   the register list and r0 will be restored prior to function return.  For
+*   functions with non-void return types, this will result in the corruption of
+*   the result register.
+* - Stack alignment is enforced via the following helper macro call-chain:
+*
+*	{prologue|epilogue} -> _align8 -> _preprocess_reglist ->
+*		_preprocess_reglist1 -> {_prologue|_epilogue}
+*
+* - Debug CFI directives are automatically added to prologues and epilogues,
+*   assisted by `cfisavelist' and `cfirestorelist', respectively.
+*
+* Arguments:
+* prologue
+* --------
+* - first	- If `last' specified, this serves as start of general-purpose
+*		  register (GPR) range to push onto stack, otherwise represents
+*		  single GPR to push onto stack.  If omitted, no GPRs pushed
+*		  onto stack at prologue.
+* - last	- If given, specifies inclusive upper-bound of GPR range.
+* - push_ip	- Determines whether IP register is to be pushed to stack at
+*		  prologue.  When pac-signing is requested, this holds the
+*		  pac-key.  Either 1 or 0 to push or not push, respectively.
+*		  Default behavior: Set to value of PAC_LEAF_PUSH_IP macro.
+* - push_lr	- Determines whether to push lr to the stack on function entry.
+*		  Either 1 or 0  to push or not push, respectively.
+*		  Either 1 or 0 to push or not push, respectively.
+*		  alignment.
+*
+* epilogue
+* --------
+*   The epilogue should be called passing the same arguments as those passed to
+*   the prologue to ensure the stack is not corrupted on function return.
+*
+* Usage examples:
+*
+*   prologue push_ip=1 -> push {ip}
+*   epilogue push_ip=1, align8=1 -> pop {r2, ip}
+*   prologue push_ip=1, push_lr=1 -> push {ip, lr}
+*   epilogue 1 -> pop {r1}
+*   prologue 1, align8=1 -> push {r0, r1}
+*   epilogue 1, push_ip=1 -> pop {r1, ip}
+*   prologue 1, 4 -> push {r1-r4}
+*   epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip}
+*
+******************************************************************************/
+
+/* Emit .cfi_restore directives for a consecutive sequence of registers.  */
+	.macro cfirestorelist first, last
+	.cfi_restore \last
+	.if \last-\first
+	 cfirestorelist \first, \last-1
+	.endif
+	.endm
+
+/* Emit .cfi_offset directives for a consecutive sequence of registers.  */
+	.macro cfisavelist first, last, index=1
+	.cfi_offset \last, -4*(\index)
+	.if \last-\first
+	 cfisavelist \first, \last-1, \index+1
+	.endif
+	.endm
+
+.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0
+	.if \push_ip & 1 != \push_ip
+	 .error "push_ip may be either 0 or 1"
+	.endif
+	.if \push_lr & 1 != \push_lr
+	 .error "push_lr may be either 0 or 1"
+	.endif
+	.if \first != -1
+	 .if \last == -1
+	  /* Upper-bound not provided: Set upper = lower.  */
+	  _prologue \first, \first, \push_ip, \push_lr
+	  .exitm
+	 .endif
+	.endif
+#if HAVE_PAC_LEAF
+# if __ARM_FEATURE_BTI_DEFAULT
+	pacbti	ip, lr, sp
+# else
+	pac	ip, lr, sp
+# endif /* __ARM_FEATURE_BTI_DEFAULT */
+	.cfi_register 143, 12
+#else
+# if __ARM_FEATURE_BTI_DEFAULT
+	bti
+# endif /* __ARM_FEATURE_BTI_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
+	.if \first != -1
+	 .if \last != \first
+	  .if \last >= 13
+	.error "SP cannot be in the save list"
+	  .endif
+	  .if \push_ip
+	   .if \push_lr
+	/* Case 1: push register range, ip and lr registers.  */
+	push {r\first-r\last, ip, lr}
+	.cfi_adjust_cfa_offset ((\last-\first)+3)*4
+	.cfi_offset 14, -4
+	.cfi_offset 143, -8
+	cfisavelist \first, \last, 3
+	   .else // !\push_lr
+	/* Case 2: push register range and ip register.  */
+	push {r\first-r\last, ip}
+	.cfi_adjust_cfa_offset ((\last-\first)+2)*4
+	.cfi_offset 143, -4
+	cfisavelist \first, \last, 2
+	   .endif
+	  .else // !\push_ip
+	   .if \push_lr
+	/* Case 3: push register range and lr register.  */
+	push {r\first-r\last, lr}
+	.cfi_adjust_cfa_offset ((\last-\first)+2)*4
+	.cfi_offset 14, -4
+	cfisavelist \first, \last, 2
+	   .else // !\push_lr
+	/* Case 4: push register range.  */
+	push {r\first-r\last}
+	.cfi_adjust_cfa_offset ((\last-\first)+1)*4
+	cfisavelist \first, \last, 1
+	   .endif
+	  .endif
+	 .else // \last == \first
+	  .if \push_ip
+	   .if \push_lr
+	/* Case 5: push single GP register plus ip and lr registers.  */
+	push {r\first, ip, lr}
+	.cfi_adjust_cfa_offset 12
+	.cfi_offset 14, -4
+	.cfi_offset 143, -8
+	cfisavelist \first, \first, 3
+	   .else // !\push_lr
+	/* Case 6: push single GP register plus ip register.  */
+	push {r\first, ip}
+	.cfi_adjust_cfa_offset 8
+	.cfi_offset 143, -4
+	cfisavelist \first, \first, 2
+	   .endif
+	  .else // !\push_ip
+	   .if \push_lr
+	/* Case 7: push single GP register plus lr register.  */
+	push {r\first, lr}
+	.cfi_adjust_cfa_offset 8
+	.cfi_offset 14, -4
+	cfisavelist \first, \first, 2
+	   .else // !\push_lr
+	/* Case 8: push single GP register.  */
+	push {r\first}
+	.cfi_adjust_cfa_offset 4
+	cfisavelist \first, \first, 1
+	   .endif
+	  .endif
+	 .endif
+	.else // \first == -1
+	 .if \push_ip
+	  .if \push_lr
+	/* Case 9: push ip and lr registers.  */
+	push {ip, lr}
+	.cfi_adjust_cfa_offset 8
+	.cfi_offset 14, -4
+	.cfi_offset 143, -8
+	  .else // !\push_lr
+	/* Case 10: push ip register.  */
+	push {ip}
+	.cfi_adjust_cfa_offset 4
+	.cfi_offset 143, -4
+	  .endif
+	 .else // !\push_ip
+	  .if \push_lr
+	/* Case 11: push lr register.  */
+	push {lr}
+	.cfi_adjust_cfa_offset 4
+	.cfi_offset 14, -4
+	  .endif
+	 .endif
+	.endif
+.endm
+
+.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0
+	.if \push_ip & 1 != \push_ip
+	 .error "push_ip may be either 0 or 1"
+	.endif
+	.if \push_lr & 1 != \push_lr
+	 .error "push_lr may be either 0 or 1"
+	.endif
+	.if \first != -1
+	 .if \last == -1
+	  /* Upper-bound not provided: Set upper = lower.  */
+	  _epilogue \first, \first, \push_ip, \push_lr
+	  .exitm
+	 .endif
+	 .if \last != \first
+	  .if \last >= 13
+	.error "SP cannot be in the save list"
+	  .endif
+	  .if \push_ip
+	   .if \push_lr
+	/* Case 1: pop register range, ip and lr registers.  */
+	pop {r\first-r\last, ip, lr}
+	.cfi_restore 14
+	.cfi_register 143, 12
+	cfirestorelist \first, \last
+	   .else // !\push_lr
+	/* Case 2: pop register range and ip register.  */
+	pop {r\first-r\last, ip}
+	.cfi_register 143, 12
+	cfirestorelist \first, \last
+	   .endif
+	  .else // !\push_ip
+	   .if \push_lr
+	/* Case 3: pop register range and lr register.  */
+	pop {r\first-r\last, lr}
+	.cfi_restore 14
+	cfirestorelist \first, \last
+	   .else // !\push_lr
+	/* Case 4: pop register range.  */
+	pop {r\first-r\last}
+	cfirestorelist \first, \last
+	   .endif
+	  .endif
+	 .else // \last == \first
+	  .if \push_ip
+	   .if \push_lr
+	/* Case 5: pop single GP register plus ip and lr registers.  */
+	pop {r\first, ip, lr}
+	.cfi_restore 14
+	.cfi_register 143, 12
+	cfirestorelist \first, \first
+	   .else // !\push_lr
+	/* Case 6: pop single GP register plus ip register.  */
+	pop {r\first, ip}
+	.cfi_register 143, 12
+	cfirestorelist \first, \first
+	   .endif
+	  .else // !\push_ip
+	   .if \push_lr
+	/* Case 7: pop single GP register plus lr register.  */
+	pop {r\first, lr}
+	.cfi_restore 14
+	cfirestorelist \first, \first
+	   .else // !\push_lr
+	/* Case 8: pop single GP register.  */
+	pop {r\first}
+	cfirestorelist \first, \first
+	   .endif
+	  .endif
+	 .endif
+	.else // \first == -1
+	 .if \push_ip
+	  .if \push_lr
+	/* Case 9: pop ip and lr registers.  */
+	pop {ip, lr}
+	.cfi_restore 14
+	.cfi_register 143, 12
+	  .else // !\push_lr
+	/* Case 10: pop ip register.  */
+	pop {ip}
+	.cfi_register 143, 12
+	  .endif
+	 .else // !\push_ip
+	  .if \push_lr
+	/* Case 11: pop lr register.  */
+	pop {lr}
+	.cfi_restore 14
+	  .endif
+	 .endif
+	.endif
+#if HAVE_PAC_LEAF
+	aut	ip, lr, sp
+#endif /* HAVE_PAC_LEAF */
+	bx	lr
+.endm
+
+/* Clean up expressions in 'last'.  */
+.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req
+	.if \last == 0
+	 \reglist_op \first, 0, \push_ip, \push_lr
+	.elseif \last == 1
+	 \reglist_op \first, 1, \push_ip, \push_lr
+	.elseif \last == 2
+	 \reglist_op \first, 2, \push_ip, \push_lr
+	.elseif \last == 3
+	 \reglist_op \first, 3, \push_ip, \push_lr
+	.elseif \last == 4
+	 \reglist_op \first, 4, \push_ip, \push_lr
+	.elseif \last == 5
+	 \reglist_op \first, 5, \push_ip, \push_lr
+	.elseif \last == 6
+	 \reglist_op \first, 6, \push_ip, \push_lr
+	.elseif \last == 7
+	 \reglist_op \first, 7, \push_ip, \push_lr
+	.elseif \last == 8
+	 \reglist_op \first, 8, \push_ip, \push_lr
+	.elseif \last == 9
+	 \reglist_op \first, 9, \push_ip, \push_lr
+	.elseif \last == 10
+	 \reglist_op \first, 10, \push_ip, \push_lr
+	.elseif \last == 11
+	 \reglist_op \first, 11, \push_ip, \push_lr
+	.else
+	 .error "last (\last) out of range"
+	.endif
+.endm
+
+/* Clean up expressions in 'first'.  */
+.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req
+	.ifb \last
+	 _preprocess_reglist \first, \first, \push_ip, \push_lr, \reglist_op
+	.else
+	 .if \first > \last
+	  .error "last (\last) must be at least as great as first (\first)"
+	 .endif
+	 .if \first == 0
+	  _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 1
+	  _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 2
+	  _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 3
+	  _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 4
+	  _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 5
+	  _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 6
+	  _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 7
+	  _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 8
+	  _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 9
+	  _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 10
+	  _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op
+	 .elseif \first == 11
+	  _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op
+	 .else
+	  .error "first (\first) out of range"
+	 .endif
+	.endif
+.endm
+
+.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue
+	.ifb \first
+	 .ifnb \last
+	  .error "can't have last (\last) without specifying first"
+	 .else // \last also blank
+	  .if ((\push_ip + \push_lr) % 2) == 0
+	   \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr
+	   .exitm
+	  .else // ((\push_ip + \push_lr) % 2) odd
+	   _align8 2, 2, \push_ip, \push_lr, \reglist_op
+	   .exitm
+	  .endif // ((\push_ip + \push_lr) % 2) == 0
+	 .endif // .ifnb \last
+	.endif // .ifb \first
+
+	.ifb \last
+	 _align8 \first, \first, \push_ip, \push_lr, \reglist_op
+	.else
+	 .if \push_ip & 1 <> \push_ip
+	  .error "push_ip may be 0 or 1"
+	 .endif
+	 .if \push_lr & 1 <> \push_lr
+	  .error "push_lr may be 0 or 1"
+	 .endif
+	 .ifeq (\last - \first + \push_ip + \push_lr) % 2
+	  .if \first == 0
+	   .error "Alignment required and first register is r0"
+	   .exitm
+	  .endif
+	  _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op
+	 .else
+	  _preprocess_reglist \first, \last, \push_ip, \push_lr, \reglist_op
+	 .endif
+	.endif
+.endm
+
+.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE
+	.if \align8
+	 _align8 \first, \last, \push_ip, \push_lr, _prologue
+	.else
+	 _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr
+	.endif
+.endm
+
+.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE
+	.if \align8
+	 _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue
+	.else
+	 _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr
+	.endif
+.endm
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;		\
+  name:			\
+  .fnstart;		\
+  .cfi_startproc;
+
+#define ENTRY(name)	ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)	\
+  .global name;		\
+  .type name,%function;	\
+  name:
+
+#if defined (IS_LEAF)
+# define END_UNWIND .cantunwind;
+#else
+# define END_UNWIND
+#endif
+
+#define END(name)	\
+  .cfi_endproc;		\
+  END_UNWIND		\
+  .fnend;		\
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#endif
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
index 1cff934..9551671 100644
--- a/string/arm/check-arch.S
+++ b/string/arm/check-arch.S
@@ -1,10 +1,13 @@
 /*
  * check ARCH setting.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if !__arm__
 # error ARCH setting does not match the compiler.
 #endif
+
+/* For attributes that may affect ABI.  */
+#include "asmdefs.h"
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 3f1ac4d..823d601 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,8 +1,8 @@
 /*
  * memchr - scan memory for a character
  *
- * Copyright (c) 2010-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2010-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /*
@@ -23,7 +23,11 @@
 @    Removed unneeded cbz from align loop
 
 	.syntax unified
+#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
+	/* Keep the configuration inherited from -march=.  */
+#else
 	.arch armv7-a
+#endif
 
 @ this lets us check a flag in a 00/ff byte easily in either endianness
 #ifdef __ARMEB__
@@ -32,6 +36,8 @@
 #define CHARTSTMASK(c) 1<<(c*8)
 #endif
 	.thumb
+#include "asmdefs.h"
+
 
 @ ---------------------------------------------------------------------------
 	.thumb_func
@@ -39,11 +45,14 @@
 	.p2align 4,,15
 	.global __memchr_arm
 	.type __memchr_arm,%function
+	.fnstart
+	.cfi_startproc
 __memchr_arm:
 	@ r0 = start of memory to scan
 	@ r1 = character to look for
 	@ r2 = length
 	@ returns r0 = pointer to character or NULL if not found
+	prologue
 	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
 
 	cmp	r2,#16		@ If it's short don't bother with anything clever
@@ -64,6 +73,11 @@
 10:
 	@ At this point, we are aligned, we know we have at least 8 bytes to work with
 	push	{r4,r5,r6,r7}
+	.cfi_adjust_cfa_offset 16
+	.cfi_rel_offset 4, 0
+	.cfi_rel_offset 5, 4
+	.cfi_rel_offset 6, 8
+	.cfi_rel_offset 7, 12
 	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
 	orr	r1, r1, r1, lsl #16
 	bic	r4, r2, #7	@ Number of double words to work with
@@ -83,6 +97,11 @@
 	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
 
 	pop	{r4,r5,r6,r7}
+	.cfi_restore 7
+	.cfi_restore 6
+	.cfi_restore 5
+	.cfi_restore 4
+	.cfi_adjust_cfa_offset -16
 	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
 	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
  
@@ -97,16 +116,25 @@
 	bne	21b		@ on r2 flags
 
 40:
+	.cfi_remember_state
 	movs	r0,#0		@ not found
-	bx	lr
+	epilogue
 
 50:
+	.cfi_restore_state
+	.cfi_remember_state
 	subs	r0,r0,#1	@ found
-	bx	lr
+	epilogue
 
 60:  @ We're here because the fast path found a hit - now we have to track down exactly which word it was
 	@ r0 points to the start of the double word after the one that was tested
 	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+	.cfi_restore_state	@ Standard post-prologue state
+	.cfi_adjust_cfa_offset 16
+	.cfi_rel_offset	4, 0
+	.cfi_rel_offset 5, 4
+	.cfi_rel_offset 6, 8
+	.cfi_rel_offset 7, 12
 	cmp	r5, #0
 	itte	eq
 	moveq	r5, r6		@ the end is in the 2nd word
@@ -126,7 +154,15 @@
 
 61:
 	pop	{r4,r5,r6,r7}
+	.cfi_restore 7
+	.cfi_restore 6
+	.cfi_restore 5
+	.cfi_restore 4
+	.cfi_adjust_cfa_offset -16
 	subs	r0,r0,#1
-	bx	lr
+	epilogue
+	.cfi_endproc
+	.cantunwind
+	.fnend
 
 	.size	__memchr_arm, . - __memchr_arm
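
The fast path above works a word at a time: the match character is expanded
across all byte lanes, XORed with the loaded data so that matching bytes
become 0x00, and a parallel zero-byte test flags a hit. A 32-bit C sketch of
the idea (illustrative; word_has_char is a hypothetical helper):

    #include <stdint.h>

    /* Non-zero iff some byte of word equals c.  */
    static int word_has_char(uint32_t word, unsigned char c)
    {
        uint32_t splat = c * 0x01010101u;   /* expand c across all bytes */
        uint32_t x = word ^ splat;          /* matching bytes become 0x00 */
        return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
    }
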
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index 86e6493..2423cfd 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /*
@@ -17,7 +17,7 @@
 
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 	.syntax unified
 	/* This implementation requires ARM state.  */
diff --git a/string/arm/memset.S b/string/arm/memset.S
index 11e9273..487b9d6 100644
--- a/string/arm/memset.S
+++ b/string/arm/memset.S
@@ -2,7 +2,7 @@
  * memset - fill memory with a constant
  *
  * Copyright (c) 2010-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /*
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index b75d414..4d55306 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,10 +1,12 @@
 /*
  * strcmp for ARMv6-M (optimized for performance, not size)
  *
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
+#include "asmdefs.h"
+
 #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
 
 	.thumb_func
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index 51443e3..74b3d23 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,8 +1,8 @@
 /*
  * strcmp for ARMv7
  *
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
@@ -12,7 +12,7 @@
    is sufficiently aligned.  Use saturating arithmetic to optimize
    the compares.  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 /* Build Options:
    STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
@@ -26,6 +26,11 @@
 
 #define STRCMP_NO_PRECHECK	0
 
+/* Ensure the .cantunwind directive is prepended to .fnend.
+   Leaf functions cannot throw exceptions - EHABI only supports
+   synchronous exceptions.  */
+#define IS_LEAF
+
 	/* This version uses Thumb-2 code.  */
 	.thumb
 	.syntax unified
@@ -98,8 +103,9 @@
 	ldrd	r4, r5, [sp], #16
 	.cfi_restore 4
 	.cfi_restore 5
+	.cfi_adjust_cfa_offset -16
 	sub	result, result, r1, lsr #24
-	bx	lr
+	epilogue push_ip=HAVE_PAC_LEAF
 #else
 	/* To use the big-endian trick we'd have to reverse all three words.
 	   that's slower than this approach.  */
@@ -119,21 +125,15 @@
 	ldrd	r4, r5, [sp], #16
 	.cfi_restore 4
 	.cfi_restore 5
+	.cfi_adjust_cfa_offset -16
 	sub	result, result, r1
 
-	bx	lr
+	epilogue push_ip=HAVE_PAC_LEAF
 #endif
 	.endm
 
-	.p2align	5
-L(strcmp_start_addr):
-#if STRCMP_NO_PRECHECK == 0
-L(fastpath_exit):
-	sub	r0, r2, r3
-	bx	lr
-	nop
-#endif
-ENTRY_ALIGN (__strcmp_arm, 0)
+ENTRY(__strcmp_arm)
+	prologue push_ip=HAVE_PAC_LEAF
 #if STRCMP_NO_PRECHECK == 0
 	ldrb	r2, [src1]
 	ldrb	r3, [src2]
@@ -143,13 +143,13 @@
 	bne	L(fastpath_exit)
 #endif
 	strd	r4, r5, [sp, #-16]!
-	.cfi_def_cfa_offset 16
-	.cfi_offset 4, -16
-	.cfi_offset 5, -12
+	.cfi_adjust_cfa_offset 16
+	.cfi_rel_offset 4, 0
+	.cfi_rel_offset 5, 4
 	orr	tmp1, src1, src2
 	strd	r6, r7, [sp, #8]
-	.cfi_offset 6, -8
-	.cfi_offset 7, -4
+	.cfi_rel_offset 6, 8
+	.cfi_rel_offset 7, 12
 	mvn	const_m1, #0
 	lsl	r2, tmp1, #29
 	cbz	r2, L(loop_aligned8)
@@ -318,10 +318,19 @@
 	mov	result, tmp1
 	ldr	r4, [sp], #16
 	.cfi_restore 4
-	bx	lr
+	.cfi_adjust_cfa_offset -16
+	epilogue push_ip=HAVE_PAC_LEAF
 
 #if STRCMP_NO_PRECHECK == 0
+L(fastpath_exit):
+	.cfi_restore_state
+	.cfi_remember_state
+	sub	r0, r2, r3
+	epilogue push_ip=HAVE_PAC_LEAF
+
 L(aligned_m1):
+	.cfi_restore_state
+	.cfi_remember_state
 	add	src2, src2, #4
 #endif
 L(src1_aligned):
@@ -368,9 +377,9 @@
 	/* R6/7 Not used in this sequence.  */
 	.cfi_restore 6
 	.cfi_restore 7
+	.cfi_adjust_cfa_offset -16
 	neg	result, result
-	bx	lr
-
+	epilogue push_ip=HAVE_PAC_LEAF
 6:
 	.cfi_restore_state
 	S2LO	data1, data1, #24
@@ -445,7 +454,8 @@
 	/* R6/7 not used in this sequence.  */
 	.cfi_restore 6
 	.cfi_restore 7
-	bx	lr
+	.cfi_adjust_cfa_offset -16
+	epilogue push_ip=HAVE_PAC_LEAF
 
 L(strcmp_tail):
 	.cfi_restore_state
@@ -467,8 +477,9 @@
 	/* R6/7 not used in this sequence.  */
 	.cfi_restore 6
 	.cfi_restore 7
+	.cfi_adjust_cfa_offset -16
 	sub	result, result, data2, lsr #24
-	bx	lr
+	epilogue push_ip=HAVE_PAC_LEAF
 
 END (__strcmp_arm)
 
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
index 02cf94f..b5728a2 100644
--- a/string/arm/strcpy.c
+++ b/string/arm/strcpy.c
@@ -2,7 +2,7 @@
  * strcpy
  *
  * Copyright (c) 2008-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if defined (__thumb2__) && !defined (__thumb__)
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 5ad30c9..5eb8671 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,8 +1,8 @@
 /*
  * strlen - calculate the length of a string
  *
- * Copyright (c) 2010-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2010-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
@@ -13,7 +13,7 @@
 
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #ifdef __ARMEB__
 #define S2LO		lsl
@@ -23,6 +23,11 @@
 #define S2HI		lsl
 #endif
 
+/* Ensure the .cantunwind directive is prepended to .fnend.
+   Leaf functions cannot throw exceptions - EHABI only supports
+   synchronous exceptions.  */
+#define IS_LEAF
+
 	/* This code requires Thumb.  */
 	.thumb
 	.syntax unified
@@ -41,8 +46,8 @@
 #define tmp2		r5
 
 ENTRY (__strlen_armv6t2)
+	prologue 4 5 push_ip=HAVE_PAC_LEAF
 	pld	[srcin, #0]
-	strd	r4, r5, [sp, #-8]!
 	bic	src, srcin, #7
 	mvn	const_m1, #0
 	ands	tmp1, srcin, #7		/* (8 - bytes) to alignment.  */
@@ -92,6 +97,7 @@
 	beq	L(loop_aligned)
 
 L(null_found):
+	.cfi_remember_state
 	cmp	data1a, #0
 	itt	eq
 	addeq	result, result, #4
@@ -100,11 +106,11 @@
 	rev	data1a, data1a
 #endif
 	clz	data1a, data1a
-	ldrd	r4, r5, [sp], #8
 	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
-	bx	lr
+	epilogue 4 5 push_ip=HAVE_PAC_LEAF
 
 L(misaligned8):
+	.cfi_restore_state
 	ldrd	data1a, data1b, [src]
 	and	tmp2, tmp1, #3
 	rsb	result, tmp1, #0
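
The clz followed by "lsr #3" at L(null_found) above is the step that turns a marker word into a length: count the bits before the first NUL marker, then shift by three ("Bits -> Bytes"). A rough little-endian C analogue, again assuming the portable zero-byte trick rather than the exact marker pattern the assembly builds (the assembly counts from the other end, after a conditional byte reverse):

#include <stdint.h>

/* Index of the first NUL byte within a word known to contain one.  */
static unsigned
first_nul_index (uint32_t w)
{
  uint32_t m = (w - 0x01010101u) & ~w & 0x80808080u;  /* 0x80 marker per NUL */
  /* Little-endian: the lowest set bit belongs to the first NUL byte, so
     count trailing zeros; the marker sits at bit 8*i + 7, and shifting
     right by three yields the byte index i, the same bits-to-bytes
     conversion as the assembly's clz + lsr #3.  */
  return (unsigned) __builtin_ctz (m) >> 3;
}
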
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index d5d4ea7..1468663 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -1,8 +1,8 @@
 /*
  * memcpy benchmark.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define _GNU_SOURCE
@@ -13,14 +13,15 @@
 #include "stringlib.h"
 #include "benchlib.h"
 
-#define ITERS 5000
+#define ITERS  5000
 #define ITERS2 20000000
-#define ITERS3 500000
-#define MAX_COPIES 8192
-#define SIZE (256*1024)
+#define ITERS3 200000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
 
-static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
-static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
 
 #define F(x) {#x, x},
 
@@ -30,15 +31,18 @@
   void *(*fun)(void *, const void *, size_t);
 } funtab[] =
 {
-  F(memcpy)
 #if __aarch64__
   F(__memcpy_aarch64)
 # if __ARM_NEON
   F(__memcpy_aarch64_simd)
 # endif
+# if __ARM_FEATURE_SVE
+  F(__memcpy_aarch64_sve)
+# endif
 #elif __arm__
   F(__memcpy_arm)
 #endif
+  F(memcpy)
 #undef F
   {0, 0}
 };
@@ -109,7 +113,7 @@
   uint64_t len : 16;
 } copy_t;
 
-static copy_t copy[MAX_COPIES];
+static copy_t test_arr[NUM_TESTS];
 
 typedef char *(*proto_t) (char *, const char *, size_t);
 
@@ -140,14 +144,14 @@
   size_t total = 0;
   /* Create a random set of copies with the given size and alignment
      distributions.  */
-  for (int i = 0; i < MAX_COPIES; i++)
+  for (int i = 0; i < NUM_TESTS; i++)
     {
-      copy[i].dst = (rand32 (0) & (max_size - 1));
-      copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
-      copy[i].src = (rand32 (0) & (max_size - 1));
-      copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
-      copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
-      total += copy[i].len;
+      test_arr[i].dst = (rand32 (0) & (max_size - 1));
+      test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+      test_arr[i].src = (rand32 (0) & (max_size - 1));
+      test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+      test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
+      total += test_arr[i].len;
     }
 
   return total;
@@ -160,25 +164,27 @@
   memset (a, 1, sizeof (a));
   memset (b, 2, sizeof (b));
 
-  printf("Random memcpy:\n");
+  printf("Random memcpy (bytes/ns):\n");
   for (int f = 0; funtab[f].name != 0; f++)
     {
       size_t total = 0;
       uint64_t tsum = 0;
-      printf ("%22s (B/ns) ", funtab[f].name);
+      printf ("%22s ", funtab[f].name);
       rand32 (0x12345678);
 
-      for (int size = 16384; size <= SIZE; size *= 2)
+      for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
 	{
 	  size_t copy_size = init_copies (size) * ITERS;
 
-	  for (int c = 0; c < MAX_COPIES; c++)
-	    funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+	  for (int c = 0; c < NUM_TESTS; c++)
+	    funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+			   test_arr[c].len);
 
 	  uint64_t t = clock_get_ns ();
 	  for (int i = 0; i < ITERS; i++)
-	    for (int c = 0; c < MAX_COPIES; c++)
-	      funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+	    for (int c = 0; c < NUM_TESTS; c++)
+	      funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+			     test_arr[c].len);
 	  t = clock_get_ns () - t;
 	  total += copy_size;
 	  tsum += t;
@@ -187,74 +193,147 @@
       printf( "avg %.2f\n", (double)total / tsum);
     }
 
-  printf ("\nMedium memcpy:\n");
+  size_t total = 0;
+  uint64_t tsum = 0;
+  printf ("%22s ", "memcpy_call");
+  rand32 (0x12345678);
+
+  for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+    {
+      size_t copy_size = init_copies (size) * ITERS;
+
+      for (int c = 0; c < NUM_TESTS; c++)
+	memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS; i++)
+	for (int c = 0; c < NUM_TESTS; c++)
+	  memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+      t = clock_get_ns () - t;
+      total += copy_size;
+      tsum += t;
+      printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+    }
+  printf( "avg %.2f\n", (double)total / tsum);
+
+
+  printf ("\nAligned medium memcpy (bytes/ns):\n");
   for (int f = 0; funtab[f].name != 0; f++)
     {
-      printf ("%22s (B/ns) ", funtab[f].name);
+      printf ("%22s ", funtab[f].name);
 
-      for (int size = 16; size <= 512; size *= 2)
+      for (int size = 8; size <= 512; size *= 2)
 	{
 	  uint64_t t = clock_get_ns ();
 	  for (int i = 0; i < ITERS2; i++)
 	    funtab[f].fun (b, a, size);
 	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
 	}
       printf ("\n");
     }
 
-  printf ("\nLarge memcpy:\n");
+  printf ("%22s ", "memcpy_call");
+  for (int size = 8; size <= 512; size *= 2)
+    {
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS2; i++)
+	memcpy (b, a, size);
+      t = clock_get_ns () - t;
+      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+    }
+  printf ("\n");
+
+
+  printf ("\nUnaligned medium memcpy (bytes/ns):\n");
   for (int f = 0; funtab[f].name != 0; f++)
     {
-      printf ("%22s (B/ns) ", funtab[f].name);
+      printf ("%22s ", funtab[f].name);
 
-      for (int size = 1024; size <= 32768; size *= 2)
+      for (int size = 8; size <= 512; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (b + 3, a + 1, size);
+	  t = clock_get_ns () - t;
+	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("%22s ", "memcpy_call");
+  for (int size = 8; size <= 512; size *= 2)
+    {
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS2; i++)
+	memcpy (b + 3, a + 1, size);
+      t = clock_get_ns () - t;
+      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+    }
+  printf ("\n");
+
+
+  printf ("\nLarge memcpy (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 1024; size <= 65536; size *= 2)
 	{
 	  uint64_t t = clock_get_ns ();
 	  for (int i = 0; i < ITERS3; i++)
 	    funtab[f].fun (b, a, size);
 	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
 	}
       printf ("\n");
     }
 
-  printf ("\nUnaligned forwards memmove:\n");
+  printf ("%22s ", "memcpy_call");
+  for (int size = 1024; size <= 65536; size *= 2)
+    {
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS3; i++)
+	memcpy (b, a, size);
+      t = clock_get_ns () - t;
+      printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+    }
+  printf ("\n");
+
+
+  printf ("\nUnaligned forwards memmove (bytes/ns):\n");
   for (int f = 0; funtab[f].name != 0; f++)
     {
-      printf ("%22s (B/ns) ", funtab[f].name);
+      printf ("%22s ", funtab[f].name);
 
-      for (int size = 1024; size <= 32768; size *= 2)
+      for (int size = 1024; size <= 65536; size *= 2)
 	{
 	  uint64_t t = clock_get_ns ();
 	  for (int i = 0; i < ITERS3; i++)
 	    funtab[f].fun (a, a + 256 + (i & 31), size);
 	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
 	}
       printf ("\n");
     }
 
 
-  printf ("\nUnaligned backwards memmove:\n");
+  printf ("\nUnaligned backwards memmove (bytes/ns):\n");
   for (int f = 0; funtab[f].name != 0; f++)
     {
-      printf ("%22s (B/ns) ", funtab[f].name);
+      printf ("%22s ", funtab[f].name);
 
-      for (int size = 1024; size <= 32768; size *= 2)
+      for (int size = 1024; size <= 65536; size *= 2)
 	{
 	  uint64_t t = clock_get_ns ();
 	  for (int i = 0; i < ITERS3; i++)
 	    funtab[f].fun (a + 256 + (i & 31), a, size);
 	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
 	}
       printf ("\n");
     }
+  printf ("\n");
 
   return 0;
 }
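
All of the rewritten benchmark loops above follow one pattern: build a randomized test set, run one untimed pass as a warm-up, then time ITERS passes and report bytes per nanosecond. The new memcpy_call rows time a call to memcpy by name, presumably so the comparison includes whatever the compiler emits for an ordinary memcpy call (e.g. an inlined builtin), while the funtab rows always go through a function pointer. A self-contained sketch of the pattern, with now_ns standing in for benchlib's clock_get_ns and a volatile pointer used to keep the compiler from specializing or hoisting the timed calls:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static uint64_t
now_ns (void)                   /* stand-in for benchlib's clock_get_ns */
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int
main (void)
{
  enum { ITERS = 100000, LEN = 4096 };
  static uint8_t src[LEN], dst[LEN];

  /* A volatile function pointer keeps the repeated calls opaque to the
     compiler, much as the indirect funtab calls do.  */
  void *(*volatile fn) (void *, const void *, size_t) = memcpy;

  fn (dst, src, LEN);           /* untimed warm-up pass */

  uint64_t t = now_ns ();
  for (int i = 0; i < ITERS; i++)
    fn (dst, src, LEN);
  t = now_ns () - t;

  printf ("%uK: %.2f bytes/ns\n", (unsigned) (LEN / 1024),
          (double) LEN * ITERS / t);
  return 0;
}
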
diff --git a/string/bench/memset.c b/string/bench/memset.c
new file mode 100644
index 0000000..990e23b
--- /dev/null
+++ b/string/bench/memset.c
@@ -0,0 +1,243 @@
+/*
+ * memset benchmark.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS  5000
+#define ITERS2 20000000
+#define ITERS3 1000000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
+
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+  const char *name;
+  void *(*fun)(void *, int, size_t);
+} funtab[] =
+{
+#if __aarch64__
+  F(__memset_aarch64)
+#elif __arm__
+  F(__memset_arm)
+#endif
+  F(memset)
+#undef F
+  {0, 0}
+};
+
+typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
+static memset_test_t test_arr[NUM_TESTS];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t len_arr[SIZE_NUM];
+
+/* Frequency data for memset sizes up to 4096 based on SPEC2017.  */
+static freq_data_t memset_len_freq[] =
+{
+{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, {  8,1412},
+{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
+{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, {  2, 200}, {  4, 192},
+{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
+{4095,133}, { 10, 130}, {  9, 124}, {  3, 124}, { 28, 120}, {  0, 118},
+{288, 110}, {1152, 96}, {104,  90}, {  1,  86}, {832,  76}, {248,  74},
+{1024, 69}, {120,  64}, {512,  63}, {384,  60}, {  6,  59}, { 80,  54},
+{ 17,  50}, {  7,  49}, {520,  47}, {2048, 39}, {256,  37}, {864,  33},
+{1440, 28}, { 22,  27}, {2056, 24}, {260,  23}, { 68,  23}, {  5,  22},
+{ 18,  21}, {200,  18}, {2120, 18}, { 60,  17}, { 52,  16}, {336,  15},
+{ 44,  13}, {192,  13}, {160,  12}, {2064, 12}, {128,  12}, { 76,  11},
+{164,  11}, {152,  10}, {136,   9}, {488,   7}, { 96,   6}, {560,   6},
+{1016,  6}, {112,   5}, {232,   5}, {168,   5}, {952,   5}, {184,   5},
+{144,   4}, {252,   4}, { 84,   3}, {960,   3}, {3808,  3}, {244,   3},
+{280,   3}, {224,   3}, {156,   3}, {1088,  3}, {440,   3}, {216,   2},
+{304,   2}, { 23,   2}, { 25,   2}, { 26,   2}, {264,   2}, {328,   2},
+{1096,  2}, {240,   2}, {1104,  2}, {704,   2}, {1664,  2}, {360,   2},
+{808,   1}, {544,   1}, {236,   1}, {720,   1}, {368,   1}, {424,   1},
+{640,   1}, {1112,  1}, {552,   1}, {272,   1}, {776,   1}, {376,   1},
+{ 92,   1}, {536,   1}, {824,   1}, {496,   1}, {760,   1}, {792,   1},
+{504,   1}, {344,   1}, {1816,  1}, {880,   1}, {176,   1}, {320,   1},
+{352,   1}, {2008,  1}, {208,   1}, {408,   1}, {228,   1}, {2072,  1},
+{568,   1}, {220,   1}, {616,   1}, {600,   1}, {392,   1}, {696,   1},
+{2144,  1}, {1280,  1}, {2136,  1}, {632,   1}, {584,   1}, {456,   1},
+{472,   1}, {3440,  1}, {2088,  1}, {680,   1}, {2928,  1}, {212,   1},
+{648,   1}, {1752,  1}, {664,   1}, {3512,  1}, {1032,  1}, {528,   1},
+{4072,  1}, {204,   1}, {2880,  1}, {3392,  1}, {712,   1}, { 59,   1},
+{736,   1}, {592,   1}, {2520,  1}, {744,   1}, {196,   1}, {172,   1},
+{728,   1}, {2040,  1}, {1192,  1}, {3600,  1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t align_arr[ALIGN_NUM];
+
+/* Alignment data for memset based on SPEC2017.  */
+static align_data_t memset_align_freq[] =
+{
+ {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
+};
+
+static void
+init_memset_distribution (void)
+{
+  int i, j, freq, size, n;
+
+  for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
+    for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
+      len_arr[n++] = size;
+  assert (n == SIZE_NUM);
+
+  for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
+    for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
+      align_arr[n++] = size - 1;
+  assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_memset (size_t max_size)
+{
+  size_t total = 0;
+  /* Create a random set of memsets with the given size and alignment
+     distributions.  */
+  for (int i = 0; i < NUM_TESTS; i++)
+    {
+      test_arr[i].offset = (rand32 (0) & (max_size - 1));
+      test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
+      test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
+      total += test_arr[i].len;
+    }
+
+  return total;
+}
+
+
+int main (void)
+{
+  init_memset_distribution ();
+
+  memset (a, 1, sizeof (a));
+
+  printf("Random memset (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      size_t total_size = 0;
+      uint64_t tsum = 0;
+      printf ("%22s ", funtab[f].name);
+      rand32 (0x12345678);
+
+      for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+	{
+	  size_t memset_size = init_memset (size) * ITERS;
+
+	  for (int c = 0; c < NUM_TESTS; c++)
+	    funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS; i++)
+	    for (int c = 0; c < NUM_TESTS; c++)
+	      funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+	  t = clock_get_ns () - t;
+	  total_size += memset_size;
+	  tsum += t;
+	  printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+	}
+      printf( "avg %.2f\n", (double)total_size / tsum);
+    }
+
+  size_t total_size = 0;
+  uint64_t tsum = 0;
+  printf ("%22s ", "memset_call");
+  rand32 (0x12345678);
+
+  for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+    {
+      size_t memset_size = init_memset (size) * ITERS;
+
+      for (int c = 0; c < NUM_TESTS; c++)
+	memset (a + test_arr[c].offset, 0, test_arr[c].len);
+
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS; i++)
+	for (int c = 0; c < NUM_TESTS; c++)
+	  memset (a + test_arr[c].offset, 0, test_arr[c].len);
+      t = clock_get_ns () - t;
+      total_size += memset_size;
+      tsum += t;
+      printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+    }
+  printf( "avg %.2f\n", (double)total_size / tsum);
+
+
+  printf ("\nMedium memset (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 8; size <= 512; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a, 0, size);
+	  t = clock_get_ns () - t;
+	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("%22s ", "memset_call");
+  for (int size = 8; size <= 512; size *= 2)
+    {
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS2; i++)
+	memset (a, 0, size);
+      t = clock_get_ns () - t;
+      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+    }
+  printf ("\n");
+
+
+  printf ("\nLarge memset (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 1024; size <= 65536; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a, 0, size);
+	  t = clock_get_ns () - t;
+	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("%22s ", "memset_call");
+  for (int size = 1024; size <= 65536; size *= 2)
+    {
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS3; i++)
+	memset (a, 0, size);
+      t = clock_get_ns () - t;
+      printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+    }
+  printf ("\n\n");
+
+  return 0;
+}
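
init_memset_distribution above shows how the SPEC2017-derived tables are consumed: each {value, freq} pair is expanded into freq consecutive slots of a power-of-two-sized lookup array, so a single masked rand32() draw picks a value with the measured relative frequency. The same idea in miniature, with a made-up three-entry table (the real arrays hold 65536 and 1024 entries):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

#define SLOTS 16                /* power of two so an index can be masked */
static const freq_data_t tab[] = { {40, 8}, {32, 6}, {296, 2}, {0, 0} };
static uint16_t lookup[SLOTS];

int
main (void)
{
  int n = 0;
  for (int i = 0; tab[i].freq != 0; i++)
    for (int j = 0; j < tab[i].freq; j++)
      lookup[n++] = tab[i].size;
  assert (n == SLOTS);          /* frequencies must sum to the array size */

  /* Size 40 is now drawn 8/16 of the time, 32 6/16, 296 2/16.  */
  for (int k = 0; k < 8; k++)
    printf ("%u\n", (unsigned) lookup[rand () & (SLOTS - 1)]);
  return 0;
}
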
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index cc0f04b..f05d0d5 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -1,8 +1,8 @@
 /*
  * strlen benchmark.
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #define _GNU_SOURCE
@@ -13,10 +13,10 @@
 #include "stringlib.h"
 #include "benchlib.h"
 
-#define ITERS 2000
+#define ITERS 5000
 #define ITERS2 20000000
 #define ITERS3 2000000
-#define NUM_STRLEN 16384
+#define NUM_TESTS 16384
 
 #define MAX_ALIGN 32
 #define MAX_STRLEN 256
@@ -49,7 +49,7 @@
 };
 #undef F
 
-static uint16_t strlen_tests[NUM_STRLEN];
+static uint16_t strlen_tests[NUM_TESTS];
 
 typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
 typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -117,7 +117,7 @@
 
   /* Create a random set of strlen input strings using the string length
      and alignment distributions.  */
-  for (int n = 0; n < NUM_STRLEN; n++)
+  for (int n = 0; n < NUM_TESTS; n++)
     {
       int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
       int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@@ -141,14 +141,14 @@
       size_t res = 0, strlen_size = 0, mask = maskv;
       printf ("%22s ", funtab[f].name);
 
-      for (int c = 0; c < NUM_STRLEN; c++)
+      for (int c = 0; c < NUM_TESTS; c++)
 	strlen_size += funtab[f].fun (a + strlen_tests[c]);
       strlen_size *= ITERS;
 
       /* Measure latency of strlen result with (res & mask).  */
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS; i++)
-	for (int c = 0; c < NUM_STRLEN; c++)
+	for (int c = 0; c < NUM_TESTS; c++)
 	  res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
       t = clock_get_ns () - t;
       printf ("%.2f\n", (double)strlen_size / t);
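
The "(res & mask)" feedback in the loop above deserves a note: when mask is zero the term contributes nothing to the address, yet because mask is a runtime value the processor cannot issue the next strlen until the previous result has arrived, so the loop measures the latency of each call rather than their overlapped throughput. The dependency chain in isolation (a sketch, with strlen standing in for any funtab entry):

#include <stddef.h>
#include <string.h>

size_t
strlen_latency_chain (const char *buf, const size_t *offsets, int n,
                      size_t mask)      /* pass 0, as the benchmark does */
{
  size_t res = 0;
  for (int c = 0; c < n; c++)
    /* The next address depends on the previous result, so the calls
       serialize instead of overlapping.  */
    res = strlen (buf + offsets[c] + (res & mask));
  return res;
}
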
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
index 0f2ce2e..f1bbea3 100644
--- a/string/include/benchlib.h
+++ b/string/include/benchlib.h
@@ -2,7 +2,7 @@
  * Benchmark support functions.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 378c3cd..f41a464 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,8 +1,8 @@
 /*
  * Public API.
  *
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stddef.h>
@@ -29,19 +29,17 @@
 size_t __strnlen_aarch64 (const char *, size_t);
 int __strncmp_aarch64 (const char *, const char *, size_t);
 void * __memchr_aarch64_mte (const void *, int, size_t);
-char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
 char *__strchr_aarch64_mte (const char *, int);
 char * __strchrnul_aarch64_mte (const char *, int );
 size_t __strlen_aarch64_mte (const char *);
 char *__strrchr_aarch64_mte (const char *, int);
-int __strcmp_aarch64_mte (const char *, const char *);
-int __strncmp_aarch64_mte (const char *, const char *, size_t);
 #if __ARM_NEON
 void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
 void *__memmove_aarch64_simd (void *, const void *, size_t);
 #endif
 # if __ARM_FEATURE_SVE
+void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
 void *__memchr_aarch64_sve (const void *, int, size_t);
 int __memcmp_aarch64_sve (const void *, const void *, size_t);
 char *__strchr_aarch64_sve (const char *, int);
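
The SVE prototypes are guarded the same way their callers are: a consumer selects an implementation under matching feature macros, as the test and benchmark tables above now do for __memcpy_aarch64_sve. A hypothetical wrapper showing the compile-time dispatch (runtime dispatch, e.g. via hwcaps or ifuncs, is outside this header's scope):

#include <stddef.h>
#include <string.h>
#include "stringlib.h"

void *
copy_fastest (void *__restrict dst, const void *__restrict src, size_t n)
{
#if __aarch64__ && __ARM_FEATURE_SVE
  return __memcpy_aarch64_sve (dst, src, n);   /* built with -march=...+sve */
#elif __aarch64__ && __ARM_NEON
  return __memcpy_aarch64_simd (dst, src, n);
#elif __aarch64__
  return __memcpy_aarch64 (dst, src, n);
#else
  return memcpy (dst, src, n);                 /* generic fallback */
#endif
}
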
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
index d8c02d9..c45fa66 100644
--- a/string/test/__mtag_tag_region.c
+++ b/string/test/__mtag_tag_region.c
@@ -2,7 +2,7 @@
  * __mtag_tag_region test.
  *
  * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
index 221c223..a4a7861 100644
--- a/string/test/__mtag_tag_zero_region.c
+++ b/string/test/__mtag_tag_zero_region.c
@@ -2,7 +2,7 @@
  * __mtag_tag_zero_region test.
  *
  * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/string/test/memchr.c b/string/test/memchr.c
index 0ff77f5..c6a9448 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -2,7 +2,7 @@
  * memchr test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index 7a7cf9c..f9236b8 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -2,7 +2,7 @@
  * memcmp test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index ce0ceee..fa15a95 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,8 +1,8 @@
 /*
  * memcpy test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -28,6 +28,9 @@
 # if __ARM_NEON
   F(__memcpy_aarch64_simd, 1)
 # endif
+# if __ARM_FEATURE_SVE
+  F(__memcpy_aarch64_sve, 1)
+# endif
 #elif __arm__
   F(__memcpy_arm, 0)
 #endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 689b68c..5d509c0 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,8 +1,8 @@
 /*
  * memmove test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -28,6 +28,9 @@
 # if __ARM_NEON
   F(__memmove_aarch64_simd, 1)
 # endif
+# if __ARM_FEATURE_SVE
+  F(__memmove_aarch64_sve, 1)
+# endif
 #endif
   {0, 0, 0}
   // clang-format on
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
index adf96f0..4171a56 100644
--- a/string/test/memrchr.c
+++ b/string/test/memrchr.c
@@ -2,7 +2,7 @@
  * memrchr test.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _GNU_SOURCE
diff --git a/string/test/memset.c b/string/test/memset.c
index f172144..5543f44 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -2,7 +2,7 @@
  * memset test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/test/mte.h b/string/test/mte.h
index e67cbd9..40b0ecf 100644
--- a/string/test/mte.h
+++ b/string/test/mte.h
@@ -2,7 +2,7 @@
  * Memory tagging testing code.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef __TEST_MTE_H
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 1827e68..0300892 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -1,8 +1,8 @@
 /*
  * stpcpy test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _GNU_SOURCE
@@ -28,8 +28,7 @@
   // clang-format off
   F(stpcpy, 0)
 #if __aarch64__
-  F(__stpcpy_aarch64, 0)
-  F(__stpcpy_aarch64_mte, 1)
+  F(__stpcpy_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__stpcpy_aarch64_sve, 1)
 # endif
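
Across the test files, the second argument to F(name, flag) flips from the removed _mte entries to the plain names, consistent with the flag marking which implementations are safe to exercise under MTE now that the MTE-safe variants are the defaults. A reconstruction of the table pattern with a hypothetical field name (the real struct lives in each test file):

#define _GNU_SOURCE
#include <string.h>

char *__stpcpy_aarch64 (char *__restrict, const char *__restrict);  /* stringlib.h */

#define F(x, mte_safe) { #x, x, mte_safe },

static const struct fun
{
  const char *name;
  char *(*fun) (char *__restrict, const char *__restrict);
  int test_mte;                 /* hypothetical name: exercise under MTE?  */
} funtab[] = {
  // clang-format off
  F(stpcpy, 0)                  /* libc reference, not tagged-memory aware */
#if __aarch64__
  F(__stpcpy_aarch64, 1)        /* MTE-safe now that the -mte variant merged */
#endif
  {0, 0, 0}
  // clang-format on
};
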
diff --git a/string/test/strchr.c b/string/test/strchr.c
index f3ae982..66180ac 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -2,7 +2,7 @@
  * strchr test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index 6c30ab2..aad0bf5 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -2,7 +2,7 @@
  * strchrnul test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _GNU_SOURCE
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index d57b54e..4aa95f4 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,8 +1,8 @@
 /*
  * strcmp test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -24,8 +24,7 @@
   // clang-format off
   F(strcmp, 0)
 #if __aarch64__
-  F(__strcmp_aarch64, 0)
-  F(__strcmp_aarch64_mte, 1)
+  F(__strcmp_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__strcmp_aarch64_sve, 1)
 # endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index e84cace..af297f9 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -1,8 +1,8 @@
 /*
  * strcpy test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -24,8 +24,7 @@
   // clang-format off
   F(strcpy, 0)
 #if __aarch64__
-  F(__strcpy_aarch64, 0)
-  F(__strcpy_aarch64_mte, 1)
+  F(__strcpy_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__strcpy_aarch64_sve, 1)
 # endif
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
index fe855fc..6bb7e1f 100644
--- a/string/test/stringtest.h
+++ b/string/test/stringtest.h
@@ -2,7 +2,7 @@
  * Common string test code.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <ctype.h>
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 6278380..47ef3dc 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -1,15 +1,14 @@
 /*
  * strlen test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/mman.h>
 #include <limits.h>
 #include "mte.h"
 #include "stringlib.h"
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 018a8a4..4bbab6f 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,8 +1,8 @@
 /*
  * strncmp test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
@@ -24,8 +24,7 @@
   // clang-format off
   F(strncmp, 0)
 #if __aarch64__
-  F(__strncmp_aarch64, 0)
-  F(__strncmp_aarch64_mte, 1)
+  F(__strncmp_aarch64, 1)
 # if __ARM_FEATURE_SVE
   F(__strncmp_aarch64_sve, 1)
 # endif
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index 0dea00e..a800fd1 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -2,7 +2,7 @@
  * strnlen test.
  *
  * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #ifndef _GNU_SOURCE
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index fedbdc5..580ca49 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -2,7 +2,7 @@
  * strrchr test.
  *
  * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #include <stdint.h>
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
index 26ade0a..5afcf7b 100644
--- a/string/x86_64/check-arch.S
+++ b/string/x86_64/check-arch.S
@@ -2,7 +2,7 @@
  * check ARCH setting.
  *
  * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 #if !__x86_64__