diff --git a/Eigen/Core b/Eigen/Core index 34838f5d7..93e7f5570 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -205,6 +205,12 @@ using std::ptrdiff_t; #include "src/Core/arch/Default/BFloat16.h" #include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" +#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE) +#include "src/Core/arch/clang/PacketMath.h" +#include "src/Core/arch/clang/TypeCasting.h" +#include "src/Core/arch/clang/Reductions.h" +#include "src/Core/arch/clang/MathFunctions.h" +#else #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/Reductions.h" @@ -297,6 +303,8 @@ using std::ptrdiff_t; #endif #endif +#endif // #ifndef EIGEN_VECTORIZE_GENERIC + #include "src/Core/arch/Default/Settings.h" // This file provides generic implementations valid for scalar as well #include "src/Core/arch/Default/GenericPacketMathFunctions.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index efe3e1fa3..fd81ae2c7 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -48,10 +48,13 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" +#ifndef EIGEN_VECTORIZE_GENERIC +// TODO(rmlarsen): Make these work with generic vectorization if possible. // Use the SSE optimized version whenever possible. #if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON) #include "src/Geometry/arch/Geometry_SIMD.h" #endif +#endif // IWYU pragma: end_exports #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/LU b/Eigen/LU index d80448039..64dcdee60 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -36,9 +36,12 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" +#ifndef EIGEN_VECTORIZE_GENERIC +// TODO(rmlarsen): Make these work with generic vectorization if possible. #if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON #include "src/LU/arch/InverseSize4.h" #endif +#endif // IWYU pragma: end_exports #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 5c8bbce6d..bfb52abe4 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -1224,7 +1224,7 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Pac template EIGEN_DEVICE_FUNC inline std::conditional_t<(unpacket_traits::size % 8) == 0, typename unpacket_traits::half, Packet> -predux_half_dowto4(const Packet& a) { +predux_half(const Packet& a) { return a; } @@ -1342,7 +1342,7 @@ struct pmadd_impl { return psub(c, pmul(a, b)); } static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) { - return pnegate(pmadd(a, b, c)); + return pnegate(padd(pmul(a, b), c)); } }; @@ -1476,8 +1476,8 @@ struct PacketBlock { Packet packet[N]; }; -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) { +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 82caebb9b..48fe815de 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -1934,15 +1934,15 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast(const Packet4d& a, const Pack } template <> -EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { +EIGEN_STRONG_INLINE Packet4f predux_half(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)); } template <> -EIGEN_STRONG_INLINE Packet4i predux_half_dowto4(const Packet8i& a) { +EIGEN_STRONG_INLINE Packet4i predux_half(const Packet8i& a) { return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); } template <> -EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4(const Packet8ui& a) { +EIGEN_STRONG_INLINE Packet4ui predux_half(const Packet8ui& a) { return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index b18105390..c8017e4f3 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -1498,7 +1498,7 @@ EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& #endif template <> -EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_half(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ __m256 lane0 = _mm512_extractf32x8_ps(a, 0); __m256 lane1 = _mm512_extractf32x8_ps(a, 1); @@ -1514,13 +1514,13 @@ EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { +EIGEN_STRONG_INLINE Packet4d predux_half(const Packet8d& a) { __m256d lane0 = _mm512_extractf64x4_pd(a, 0); __m256d lane1 = _mm512_extractf64x4_pd(a, 1); return _mm256_add_pd(lane0, lane1); } template <> -EIGEN_STRONG_INLINE Packet8i predux_half_dowto4(const Packet16i& a) { +EIGEN_STRONG_INLINE Packet8i predux_half(const Packet16i& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ __m256i lane0 = _mm512_extracti32x8_epi32(a, 0); __m256i lane1 = _mm512_extracti32x8_epi32(a, 1); @@ -1537,7 +1537,7 @@ EIGEN_STRONG_INLINE Packet8i predux_half_dowto4(const Packet16i& a) { } template <> -EIGEN_STRONG_INLINE Packet4l predux_half_dowto4(const Packet8l& a) { +EIGEN_STRONG_INLINE Packet4l predux_half(const Packet8l& a) { __m256i lane0 = _mm512_extracti64x4_epi64(a, 0); __m256i lane1 = _mm512_extracti64x4_epi64(a, 1); return _mm256_add_epi64(lane0, lane1); @@ -2285,7 +2285,7 @@ EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet } template <> -EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { +EIGEN_STRONG_INLINE Packet8h predux_half(const Packet16h& a) { Packet8h lane0 = _mm256_extractf128_si256(a, 0); Packet8h lane1 = _mm256_extractf128_si256(a, 1); return padd(lane0, lane1); @@ -2791,7 +2791,7 @@ EIGEN_STRONG_INLINE Packet16bf plset(const bfloat16& a) { } template <> -EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4(const Packet16bf& a) { +EIGEN_STRONG_INLINE Packet8bf predux_half(const Packet16bf& a) { Packet8bf lane0 = _mm256_extractf128_si256(a, 0); Packet8bf lane1 = _mm256_extractf128_si256(a, 1); return padd(lane0, lane1); diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index a46a8eff0..827386fb0 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -695,7 +695,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) { - Packet x = _x; const Packet cst_zero = pset1(0.0); const Packet cst_1 = pset1(1.0); const Packet cst_2 = pset1(2.0); @@ -719,7 +718,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac // clamp x Packet zero_mask = pcmp_lt(_x, cst_exp_lo); - x = pmin(x, cst_exp_hi); + Packet x = pmin(_x, cst_exp_hi); + // Express exp(x) as exp(g + n*log(2)). fx = pmadd(cst_cephes_LOG2EF, x, cst_half); @@ -1352,7 +1352,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) const T minus_clamp = pset1(-17.714196154005176); #endif const T x = pmax(pmin(a_x, plus_clamp), minus_clamp); - // The following rational approximation was generated by rminimax // (https://gitlab.inria.fr/sfilip/rminimax) using the following // command: diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index b9d0866a0..6f93b1513 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3500,27 +3500,27 @@ EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) { #endif template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half(const Packet8c& a) { return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half(const Packet16c& a) { return vadd_s8(vget_high_s8(a), vget_low_s8(a)); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half(const Packet8uc& a) { return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half(const Packet16uc& a) { return vadd_u8(vget_high_u8(a), vget_low_u8(a)); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half(const Packet8s& a) { return vadd_s16(vget_high_s16(a), vget_low_s16(a)); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half(const Packet8us& a) { return vadd_u16(vget_high_u16(a), vget_low_u16(a)); } @@ -5401,7 +5401,7 @@ struct unpacket_traits : neon_unpacket_default { }; template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4(const Packet8hf& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half(const Packet8hf& a) { return vadd_f16(vget_low_f16(a), vget_high_f16(a)); } diff --git a/Eigen/src/Core/arch/clang/MathFunctions.h b/Eigen/src/Core/arch/clang/MathFunctions.h new file mode 100644 index 000000000..706a87051 --- /dev/null +++ b/Eigen/src/Core/arch/clang/MathFunctions.h @@ -0,0 +1,47 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CLANG_H +#define EIGEN_MATH_FUNCTIONS_CLANG_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template <> +EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pfrexp(const Packet8d& a, Packet8d& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pldexp(const Packet16f& a, const Packet16f& exponent) { + return pldexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& exponent) { + return pldexp_generic(a, exponent); +} + +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d) + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CLANG_H diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h new file mode 100644 index 000000000..3524ad653 --- /dev/null +++ b/Eigen/src/Core/arch/clang/PacketMath.h @@ -0,0 +1,869 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CLANG_H +#define EIGEN_PACKET_MATH_CLANG_H + +namespace Eigen { +namespace internal { + +namespace detail { +// namespace detail contains implementation details specific to this +// file, while namespace internal contains internal APIs used elsewhere +// in Eigen. +template +using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT)))); +} // namespace detail + +// --- Primary packet type definitions (fixed at 64 bytes) --- + +// TODO(rmlarsen): Generalize to other vector sizes. +static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, "We currently assume the full vector size is 64 bytes"); +using Packet16f = detail::VectorType; +using Packet8d = detail::VectorType; +using Packet16i = detail::VectorType; +using Packet8l = detail::VectorType; + +// --- packet_traits specializations --- +template <> +struct packet_traits : default_packet_traits { + using type = Packet16f; + using half = Packet16f; + enum { + Vectorizable = 1, + size = 16, + AlignedOnScalar = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasRound = 1, + HasMinMax = 1, + HasCmp = 1, + HasSet1 = 1, + HasCast = 1, + HasBitwise = 1, + HasRedux = 1, + HasSign = 1, + HasArg = 0, + HasConj = 1, + // Math functions + HasReciprocal = 1, + HasSin = 1, + HasCos = 1, + HasACos = 1, + HasASin = 1, + HasATan = 1, + HasATanh = 1, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasPow = 1, + HasNdtri = 1, + HasBessel = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasCbrt = 1, + HasTanh = 1, + HasErf = 1, + HasErfc = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + using type = Packet8d; + using half = Packet8d; + enum { + Vectorizable = 1, + size = 8, + AlignedOnScalar = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasRound = 1, + HasMinMax = 1, + HasCmp = 1, + HasSet1 = 1, + HasCast = 1, + HasBitwise = 1, + HasRedux = 1, + HasSign = 1, + HasArg = 0, + HasConj = 1, + // Math functions + HasReciprocal = 1, + HasSin = 1, + HasCos = 1, + HasACos = 0, + HasASin = 0, + HasATan = 1, + HasATanh = 1, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasPow = 1, + HasNdtri = 1, + HasBessel = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasCbrt = 1, + HasTanh = 1, + HasErf = 1, + HasErfc = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + using type = Packet16i; + using half = Packet16i; + enum { + Vectorizable = 1, + size = 16, + AlignedOnScalar = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasMinMax = 1, + HasCmp = 1, + HasSet1 = 1, + HasCast = 1, + HasBitwise = 1, + HasRedux = 1, + // Set remaining to 0 + HasRound = 1, + HasSqrt = 0, + HasRsqrt = 0, + HasReciprocal = 0, + HasArg = 0, + HasConj = 1, + HasExp = 0, + HasLog = 0, + HasSin = 0, + HasCos = 0, + }; +}; + +template <> +struct packet_traits : default_packet_traits { + using type = Packet8l; + using half = Packet8l; + enum { + Vectorizable = 1, + size = 8, + AlignedOnScalar = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasMinMax = 1, + HasCmp = 1, + HasSet1 = 1, + HasCast = 1, + HasBitwise = 1, + HasRedux = 1, + // Set remaining to 0 + HasRound = 1, + HasSqrt = 0, + HasRsqrt = 0, + HasReciprocal = 0, + HasArg = 0, + HasConj = 1, + HasExp = 0, + HasLog = 0, + HasSin = 0, + HasCos = 0, + }; +}; + +// --- unpacket_traits specializations --- +template <> +struct unpacket_traits { + using type = float; + using half = Packet16f; + using integer_packet = Packet16i; + enum { + size = 16, + alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + using type = double; + using half = Packet8d; + using integer_packet = Packet8l; + enum { + size = 8, + alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + using type = int32_t; + using half = Packet16i; + enum { + size = 16, + alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +struct unpacket_traits { + using type = int64_t; + using half = Packet8l; + enum { + size = 8, + alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +namespace detail { +// --- vector type helpers --- +template +struct ScalarTypeOfVector { + using type = std::remove_all_extents_t>; +}; + +template +using scalar_type_of_vector_t = typename ScalarTypeOfVector::type; + +template +struct UnsignedVectorHelpter { + static VectorType v; + static constexpr int n = __builtin_vectorelements(v); + using UnsignedScalar = std::make_unsigned_t>; + using type = UnsignedScalar __attribute__((ext_vector_type(n), aligned(n * sizeof(UnsignedScalar)))); +}; + +template +using unsigned_vector_t = typename UnsignedVectorHelpter::type; + +template +using HalfPacket = VectorType::type, unpacket_traits::size / 2>; + +template +using QuarterPacket = VectorType::type, unpacket_traits::size / 4>; + +// load and store helpers. +template +EIGEN_STRONG_INLINE VectorT load_vector_unaligned(const scalar_type_of_vector_t* from) { + VectorT to; + constexpr int n = __builtin_vectorelements(to); + for (int i = 0; i < n; ++i) { + to[i] = from[i]; + } + return to; +} + +template +EIGEN_STRONG_INLINE VectorT load_vector_aligned(const scalar_type_of_vector_t* from) { + return *reinterpret_cast(assume_aligned(from)); +} + +template +EIGEN_STRONG_INLINE void store_vector_unaligned(scalar_type_of_vector_t* to, const VectorT& from) { + constexpr int n = __builtin_vectorelements(from); + for (int i = 0; i < n; ++i) { + *to++ = from[i]; + } +} + +template +EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t* to, const VectorT& from) { + *reinterpret_cast(assume_aligned(to)) = from; +} + +} // namespace detail + +// --- Intrinsic-like specializations --- + +// --- Load/Store operations --- +#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE, SCALAR_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE ploadu(const SCALAR_TYPE* from) { \ + return detail::load_vector_unaligned(from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pload(const SCALAR_TYPE* from) { \ + return detail::load_vector_aligned(from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstoreu(SCALAR_TYPE * to, const PACKET_TYPE& from) { \ + detail::store_vector_unaligned(to, from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstore(SCALAR_TYPE * to, const PACKET_TYPE& from) { \ + detail::store_vector_aligned(to, from); \ + } + +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f, float) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d, double) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i, int32_t) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l, int64_t) +#undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET + +// --- Broadcast operation --- +template <> +EIGEN_STRONG_INLINE Packet16f pset1frombits(uint32_t from) { + return Packet16f(numext::bit_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pset1frombits(uint64_t from) { + return Packet8d(numext::bit_cast(from)); +} + +#define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pset1(const unpacket_traits::type& from) { \ + return PACKET_TYPE(from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type pfirst(const PACKET_TYPE& from) { \ + return from[0]; \ + } + +EIGEN_CLANG_PACKET_SET1(Packet16f) +EIGEN_CLANG_PACKET_SET1(Packet8d) +EIGEN_CLANG_PACKET_SET1(Packet16i) +EIGEN_CLANG_PACKET_SET1(Packet8l) +#undef EIGEN_CLANG_PACKET_SET1 + +// --- Arithmetic operations --- +#define EIGEN_CLANG_PACKET_ARITHMETIC(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pisnan(const PACKET_TYPE& a) { \ + return reinterpret_cast(a != a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pnegate(const PACKET_TYPE& a) { \ + return -a; \ + } + +EIGEN_CLANG_PACKET_ARITHMETIC(Packet16f) +EIGEN_CLANG_PACKET_ARITHMETIC(Packet8d) +EIGEN_CLANG_PACKET_ARITHMETIC(Packet16i) +EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l) +#undef EIGEN_CLANG_PACKET_ARITHMETIC + +// --- Bitwise operations (via casting) --- + +namespace detail { + +// Note: pcast functions are not template specializations, just helpers +// identical to preinterpret. We duplicate them here to avoid a circular +// dependence with TypeCasting.h. +EIGEN_STRONG_INLINE Packet16i pcast_float_to_int(const Packet16f& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE Packet16f pcast_int_to_float(const Packet16i& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE Packet8l pcast_double_to_long(const Packet8d& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE Packet8d pcast_long_to_double(const Packet8l& a) { return reinterpret_cast(a); } + +} // namespace detail + +// Bitwise ops for integer packets +#define EIGEN_CLANG_PACKET_BITWISE_INT(PACKET_TYPE) \ + template <> \ + constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue(const PACKET_TYPE& /*unused*/) { \ + return PACKET_TYPE(0) == PACKET_TYPE(0); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pand(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a & b; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE por(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a | b; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pxor(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a ^ b; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pandnot(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a & ~b; \ + } \ + template \ + EIGEN_STRONG_INLINE PACKET_TYPE parithmetic_shift_right(const PACKET_TYPE& a) { \ + return a >> N; \ + } \ + template \ + EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_right(const PACKET_TYPE& a) { \ + using UnsignedT = detail::unsigned_vector_t; \ + return reinterpret_cast(reinterpret_cast(a) >> N); \ + } \ + template \ + EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_left(const PACKET_TYPE& a) { \ + return a << N; \ + } + +EIGEN_CLANG_PACKET_BITWISE_INT(Packet16i) +EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l) +#undef EIGEN_CLANG_PACKET_BITWISE_INT + +// Bitwise ops for floating point packets +#define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE ptrue(const PACKET_TYPE& a) { \ + return CAST_FROM_INT(CAST_TO_INT(a) == CAST_TO_INT(a)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pand(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return CAST_FROM_INT(CAST_TO_INT(a) & CAST_TO_INT(b)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE por(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return CAST_FROM_INT(CAST_TO_INT(a) | CAST_TO_INT(b)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pxor(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return CAST_FROM_INT(CAST_TO_INT(a) ^ CAST_TO_INT(b)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pandnot(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b)); \ + } + +EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::pcast_float_to_int, detail::pcast_int_to_float) +EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::pcast_double_to_long, detail::pcast_long_to_double) +#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT + +// --- Min/Max operations --- +#if __has_builtin(__builtin_elementwise_min) && __has_builtin(__builtin_elementwise_max) && \ + __has_builtin(__builtin_elementwise_abs) +#define EIGEN_CLANG_PACKET_ELEMENTWISE(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmin(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + /* Match NaN propagation of std::min. */ \ + return a == a ? __builtin_elementwise_min(a, b) : a; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmax(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + /* Match NaN propagation of std::max. */ \ + return a == a ? __builtin_elementwise_max(a, b) : a; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmin(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return __builtin_elementwise_min(a, b); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmax(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return __builtin_elementwise_max(a, b); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmin(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a != a ? a : (b != b ? b : __builtin_elementwise_min(a, b)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmax(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return a != a ? a : (b != b ? b : __builtin_elementwise_max(a, b)); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pabs(const PACKET_TYPE& a) { \ + return __builtin_elementwise_abs(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pselect(const PACKET_TYPE& mask, const PACKET_TYPE& a, \ + const PACKET_TYPE& b) { \ + return __builtin_elementwise_abs(mask) == 0 ? b : a; \ + } + +EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f) +EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8d) +EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16i) +EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l) +#undef EIGEN_CLANG_PACKET_ELEMENTWISE +#endif + +// --- Math functions (float/double only) --- + +#if __has_builtin(__builtin_elementwise_floor) && __has_builtin(__builtin_elementwise_ceil) && \ + __has_builtin(__builtin_elementwise_round) && __has_builtin(__builtin_elementwise_roundeven) && \ + __has_builtin(__builtin_elementwise_trunc) && __has_builtin(__builtin_elementwise_sqrt) +#define EIGEN_CLANG_PACKET_MATH_FLOAT(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pfloor(const PACKET_TYPE& a) { \ + return __builtin_elementwise_floor(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pceil(const PACKET_TYPE& a) { \ + return __builtin_elementwise_ceil(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pround(const PACKET_TYPE& a) { \ + return __builtin_elementwise_round(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE print(const PACKET_TYPE& a) { \ + return __builtin_elementwise_roundeven(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE ptrunc(const PACKET_TYPE& a) { \ + return __builtin_elementwise_trunc(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE psqrt(const PACKET_TYPE& a) { \ + return __builtin_elementwise_sqrt(a); \ + } + +EIGEN_CLANG_PACKET_MATH_FLOAT(Packet16f) +EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d) +#undef EIGEN_CLANG_PACKET_MATH_FLOAT +#endif + +// --- Fused Multiply-Add (MADD) --- +#if defined(EIGEN_VECTORIZE_FMA) && __has_builtin(__builtin_elementwise_fma) +#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmadd(const PACKET_TYPE& a, const PACKET_TYPE& b, \ + const PACKET_TYPE& c) { \ + return __builtin_elementwise_fma(a, b, c); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmsub(const PACKET_TYPE& a, const PACKET_TYPE& b, \ + const PACKET_TYPE& c) { \ + return __builtin_elementwise_fma(a, b, -c); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pnmadd(const PACKET_TYPE& a, const PACKET_TYPE& b, \ + const PACKET_TYPE& c) { \ + return __builtin_elementwise_fma(-a, b, c); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pnmsub(const PACKET_TYPE& a, const PACKET_TYPE& b, \ + const PACKET_TYPE& c) { \ + return -(__builtin_elementwise_fma(a, b, c)); \ + } +#else +// Fallback if FMA builtin is not available +#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmadd(const PACKET_TYPE& a, const PACKET_TYPE& b, \ + const PACKET_TYPE& c) { \ + return (a * b) + c; \ + } +#endif + +EIGEN_CLANG_PACKET_MADD(Packet16f) +EIGEN_CLANG_PACKET_MADD(Packet8d) +#undef EIGEN_CLANG_PACKET_MADD + +#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE void pscatter(unpacket_traits::type* to, const PACKET_TYPE& from, Index stride) { \ + constexpr int size = unpacket_traits::size; \ + for (int i = 0; i < size; ++i) { \ + to[i * stride] = from[i]; \ + } \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pgather::type, PACKET_TYPE>( \ + const unpacket_traits::type* from, Index stride) { \ + constexpr int size = unpacket_traits::size; \ + unpacket_traits::type arr[size]; \ + for (int i = 0; i < size; ++i) { \ + arr[i] = from[i * stride]; \ + } \ + return *reinterpret_cast(arr); \ + } + +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f) +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d) +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i) +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l) +#undef EIGEN_CLANG_PACKET_SCATTER_GATHER + +// ---- Various operations that depend on __builtin_shufflevector. +#if __has_builtin(__builtin_shufflevector) +namespace detail { +template +EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) { + return __builtin_shufflevector(a, a, 7, 6, 5, 4, 3, 2, 1, 0); +} +template +EIGEN_STRONG_INLINE Packet preverse_impl_16(const Packet& a) { + return __builtin_shufflevector(a, a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +} +} // namespace detail + +#define EIGEN_CLANG_PACKET_REVERSE(PACKET_TYPE, SIZE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE preverse(const PACKET_TYPE& a) { \ + return detail::preverse_impl_##SIZE(a); \ + } + +EIGEN_CLANG_PACKET_REVERSE(Packet16f, 16) +EIGEN_CLANG_PACKET_REVERSE(Packet8d, 8) +EIGEN_CLANG_PACKET_REVERSE(Packet16i, 16) +EIGEN_CLANG_PACKET_REVERSE(Packet8l, 8) +#undef EIGEN_CLANG_PACKET_REVERSE + +namespace detail { +template +EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 2 == 0, "Packet size must be a multiple of 2"); + using HalfPacket = HalfPacket; + HalfPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); +} + +template +EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); + using QuarterPacket = QuarterPacket; + QuarterPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3); +} + +template +EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 2 == 0, "Packet size must be a multiple of 2"); + using HalfPacket = HalfPacket; + HalfPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3); +} + +template +EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); + using QuarterPacket = QuarterPacket; + QuarterPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1); +} + +} // namespace detail + +template <> +EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { + return detail::ploaddup16(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { + return detail::ploaddup8(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i ploaddup(const int32_t* from) { + return detail::ploaddup16(from); +} +template <> +EIGEN_STRONG_INLINE Packet8l ploaddup(const int64_t* from) { + return detail::ploaddup8(from); +} + +template <> +EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { + return detail::ploadquad16(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { + return detail::ploadquad8(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i ploadquad(const int32_t* from) { + return detail::ploadquad16(from); +} +template <> +EIGEN_STRONG_INLINE Packet8l ploadquad(const int64_t* from) { + return detail::ploadquad8(from); +} + +template <> +EIGEN_STRONG_INLINE Packet16f plset(const float& a) { + Packet16f x{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f, + a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f}; + return x; +} +template <> +EIGEN_STRONG_INLINE Packet8d plset(const double& a) { + return Packet8d{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0}; +} +template <> +EIGEN_STRONG_INLINE Packet16i plset(const int32_t& a) { + return Packet16i{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7, + a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15}; +} +template <> +EIGEN_STRONG_INLINE Packet8l plset(const int64_t& a) { + return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7}; +} + +// Helpers for ptranspose. +namespace detail { + +template +EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) { + Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + p1 = tmp; +} + +template +EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) { + Packet tmp = __builtin_shufflevector(p1, p2, 0, 8, 1, 9, 2, 10, 3, 11); + p2 = __builtin_shufflevector(p1, p2, 4, 12, 5, 13, 6, 14, 7, 15); + p1 = tmp; +} + +template +void zip_in_place(Packet& p1, Packet& p2); + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16f& p1, Packet16f& p2) { + zip_in_place16(p1, p2); +} + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8d& p1, Packet8d& p2) { + zip_in_place8(p1, p2); +} + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16i& p1, Packet16i& p2) { + zip_in_place16(p1, p2); +} + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8l& p1, Packet8l& p2) { + zip_in_place8(p1, p2); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], kernel.packet[7]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + EIGEN_UNROLL_LOOP + for (int i = 0; i < 4; ++i) { + const int m = (1 << i); + EIGEN_UNROLL_LOOP + for (int j = 0; j < m; ++j) { + const int n = (1 << (3 - i)); + EIGEN_UNROLL_LOOP + for (int k = 0; k < n; ++k) { + const int idx = 2 * j * n + k; + zip_in_place(kernel.packet[idx], kernel.packet[idx + n]); + } + } + } +} + +} // namespace detail + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_CLANG_H diff --git a/Eigen/src/Core/arch/clang/Reductions.h b/Eigen/src/Core/arch/clang/Reductions.h new file mode 100644 index 000000000..8454a9a85 --- /dev/null +++ b/Eigen/src/Core/arch/clang/Reductions.h @@ -0,0 +1,112 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_REDUCTIONS_CLANG_H +#define EIGEN_REDUCTIONS_CLANG_H + +namespace Eigen { +namespace internal { + +// --- Reductions --- +#if __has_builtin(__builtin_reduce_min) && __has_builtin(__builtin_reduce_max) && __has_builtin(__builtin_reduce_or) +#define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type predux_min(const PACKET_TYPE& a) { \ + return __builtin_reduce_min(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type predux_max(const PACKET_TYPE& a) { \ + return __builtin_reduce_max(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE bool predux_any(const PACKET_TYPE& a) { \ + return __builtin_reduce_or(a != 0) != 0; \ + } + +EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16f) +EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8d) +EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16i) +EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l) +#undef EIGEN_CLANG_PACKET_REDUX_MINMAX +#endif + +#if __has_builtin(__builtin_reduce_add) && __has_builtin(__builtin_reduce_mul) +#define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type predux(const PACKET_TYPE& a) { \ + return __builtin_reduce_add(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type predux_mul(const PACKET_TYPE& a) { \ + return __builtin_reduce_mul(a); \ + } + +// __builtin_reduce_{mul,add} are only defined for integer types. +EIGEN_CLANG_PACKET_REDUX_INT(Packet16i) +EIGEN_CLANG_PACKET_REDUX_INT(Packet8l) +#undef EIGEN_CLANG_PACKET_REDUX_INT +#endif + +#if __has_builtin(__builtin_shufflevector) +namespace detail { +template +EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceAdd16(const VectorT& a) { + auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) + + __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15); + auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) + __builtin_shufflevector(t1, t1, 1, 3, 5, 7); + auto t3 = __builtin_shufflevector(t2, t2, 0, 2) + __builtin_shufflevector(t2, t2, 1, 3); + return t3[0] + t3[1]; +} + +template +EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceAdd8(const VectorT& a) { + auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) + __builtin_shufflevector(a, a, 1, 3, 5, 7); + auto t2 = __builtin_shufflevector(t1, t1, 0, 2) + __builtin_shufflevector(t1, t1, 1, 3); + return t2[0] + t2[1]; +} + +template +EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceMul16(const VectorT& a) { + auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) * + __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15); + auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) * __builtin_shufflevector(t1, t1, 1, 3, 5, 7); + auto t3 = __builtin_shufflevector(t2, t2, 0, 2) * __builtin_shufflevector(t2, t2, 1, 3); + return t3[0] * t3[1]; +} + +template +EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceMul8(const VectorT& a) { + auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) * __builtin_shufflevector(a, a, 1, 3, 5, 7); + auto t2 = __builtin_shufflevector(t1, t1, 0, 2) * __builtin_shufflevector(t1, t1, 1, 3); + return t2[0] * t2[1]; +} +} // namespace detail + +template <> +EIGEN_STRONG_INLINE float predux(const Packet16f& a) { + return detail::ReduceAdd16(a); +} +template <> +EIGEN_STRONG_INLINE double predux(const Packet8d& a) { + return detail::ReduceAdd8(a); +} +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { + return detail::ReduceMul16(a); +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { + return detail::ReduceMul8(a); +} +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_REDUCTIONS_CLANG_H diff --git a/Eigen/src/Core/arch/clang/TypeCasting.h b/Eigen/src/Core/arch/clang/TypeCasting.h new file mode 100644 index 000000000..0aa44963f --- /dev/null +++ b/Eigen/src/Core/arch/clang/TypeCasting.h @@ -0,0 +1,63 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_CLANG_H +#define EIGEN_TYPE_CASTING_CLANG_H + +namespace Eigen { +namespace internal { + +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { + return reinterpret_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { + return reinterpret_cast(a); +} + +template <> +EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet8l& a) { + return reinterpret_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8l preinterpret(const Packet8d& a) { + return reinterpret_cast(a); +} + +//============================================================================== +// pcast +//============================================================================== +#if __has_builtin(__builtin_convertvector) +template <> +EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return __builtin_convertvector(a, Packet16i); +} +template <> +EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return __builtin_convertvector(a, Packet16f); +} + +template <> +EIGEN_STRONG_INLINE Packet8l pcast(const Packet8d& a) { + return __builtin_convertvector(a, Packet8l); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcast(const Packet8l& a) { + return __builtin_convertvector(a, Packet8d); +} +#endif + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CLANG_H diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e72c6b48e..7d308382e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -642,25 +642,21 @@ DoublePacket padd(const DoublePacket& a, const DoublePacket the "4" in "downto4" -// corresponds to the number of complexes, so it means "8" -// it terms of real coefficients. - template -const DoublePacket& predux_half_dowto4(const DoublePacket& a, - std::enable_if_t::size <= 8>* = 0) { +const DoublePacket& predux_half(const DoublePacket& a, + std::enable_if_t::size <= 8>* = 0) { return a; } template -DoublePacket::half> predux_half_dowto4( +DoublePacket::half> predux_half( const DoublePacket& a, std::enable_if_t::size == 16>* = 0) { // yes, that's pretty hackish :( DoublePacket::half> res; typedef std::complex::type> Cplx; typedef typename packet_traits::type CplxPacket; - res.first = predux_half_dowto4(CplxPacket(a.first)).v; - res.second = predux_half_dowto4(CplxPacket(a.second)).v; + res.first = predux_half(CplxPacket(a.first)).v; + res.second = predux_half(CplxPacket(a.second)).v; return res; } @@ -1067,7 +1063,7 @@ struct last_row_process_16_packets 0) { // We have to handle the last row(s) of the rhs, which // correspond to a half-packet - SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + SAccPacketQuarter c0 = predux_half(predux_half(C0)); for (Index kk = endk; kk < depth; kk++) { SLhsPacketQuarter a0; @@ -1080,7 +1076,7 @@ struct last_row_process_16_packets); straits.acc(c0, alphav, R); } else { - straits.acc(predux_half_dowto4(C0), alphav, R); + straits.acc(predux_half(C0), alphav, R); } res.scatterPacket(i, j2, R); } else if (SwappedTraits::LhsProgress == 16) { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index c2546a083..ae8aed9e8 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -11,6 +11,18 @@ #ifndef EIGEN_CONFIGURE_VECTORIZATION_H #define EIGEN_CONFIGURE_VECTORIZATION_H +// Prepare for using the generic clang backend if requested. +#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_DONT_ALIGN) +#if !EIGEN_ARCH_VECTOR_EXTENSIONS +#error "The compiler does not support clang vector extensions." +#endif +#define EIGEN_VECTORIZE +#ifndef EIGEN_GENERIC_VECTOR_SIZE_BYTES +#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64 +#endif +#define EIGEN_MAX_ALIGN_BYTES EIGEN_GENERIC_VECTOR_SIZE_BYTES +#endif + //------------------------------------------------------------------------------------------ // Static and dynamic alignment control // @@ -504,7 +516,7 @@ extern "C" { namespace Eigen { -inline static const char *SimdInstructionSetsInUse(void) { +inline static const char* SimdInstructionSetsInUse(void) { #if defined(EIGEN_VECTORIZE_AVX512) return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; #elif defined(EIGEN_VECTORIZE_AVX) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index a2c0d9479..2b5697aca 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -830,6 +830,13 @@ #endif #endif +// Does the compiler support vector types? +#if __has_attribute(ext_vector_type) && __has_builtin(__builtin_vectorelements) +#define EIGEN_ARCH_VECTOR_EXTENSIONS 1 +#else +#define EIGEN_ARCH_VECTOR_EXTENSIONS 0 +#endif + // Multidimensional subscript operator feature test #if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L #define EIGEN_MULTIDIMENSIONAL_SUBSCRIPT diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 18574d25a..959abd97b 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -685,8 +685,8 @@ void packetmath() { int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize; for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0); for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i]; - internal::pstore(data2, internal::predux_half_dowto4(internal::pload(data1))); - VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4"); + internal::pstore(data2, internal::predux_half(internal::pload(data1))); + VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half"); } // Avoid overflows. diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions index 4f7e59935..376b025fd 100644 --- a/unsupported/Eigen/SpecialFunctions +++ b/unsupported/Eigen/SpecialFunctions @@ -77,6 +77,8 @@ namespace Eigen { #include "src/SpecialFunctions/SpecialFunctionsFunctors.h" #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" +#ifndef EIGEN_VECTORIZE_GENERIC +// TODO(rmlarsen): Make these work with generic vectorization if possible. #if defined EIGEN_VECTORIZE_AVX512 #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h" #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h" @@ -93,6 +95,7 @@ namespace Eigen { #if defined EIGEN_VECTORIZE_GPU #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h" #endif +#endif // IWYU pragma: end_exports namespace Eigen {