mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-01-18 17:31:19 +01:00
Add a generic Eigen backend based on clang vector extensions
The goal of this MR is to implement a generic SIMD backend (packet ops) for Eigen that uses clang vector extensions instead of platform-dependent intrinsics. Ideally, this should make it possible to build Eigen and achieve reasonable speed on any platform that has a recent clang compiler, without having to write any inline assembly or intrinsics. Caveats: * The current implementation is a proof of concept and supports vectorization for float, double, int32_t, and int64_t using fixed-size 512-bit vectors (a somewhat arbitrary choice). I have not done much to tune this for speed yet. * For now, there is no way to enable this other than setting -DEIGEN_VECTORIZE_GENERIC on the command line. * This only compiles with newer versions of clang. I have tested that it compiles and all tests pass with clang 19.1.7. https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors Closes #2998 and #2997 See merge request libeigen/eigen!2051 Co-authored-by: Rasmus Munk Larsen <rmlarsen@google.com> Co-authored-by: Antonio Sánchez <cantonios@google.com>
This commit is contained in:
@@ -205,6 +205,12 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/Default/BFloat16.h"
|
||||
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
|
||||
|
||||
#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE)
|
||||
#include "src/Core/arch/clang/PacketMath.h"
|
||||
#include "src/Core/arch/clang/TypeCasting.h"
|
||||
#include "src/Core/arch/clang/Reductions.h"
|
||||
#include "src/Core/arch/clang/MathFunctions.h"
|
||||
#else
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/Reductions.h"
|
||||
@@ -297,6 +303,8 @@ using std::ptrdiff_t;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #ifndef EIGEN_VECTORIZE_GENERIC
|
||||
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
// This file provides generic implementations valid for scalar as well
|
||||
#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
|
||||
|
||||
@@ -48,10 +48,13 @@
|
||||
#include "src/Geometry/AlignedBox.h"
|
||||
#include "src/Geometry/Umeyama.h"
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_GENERIC
|
||||
// TODO(rmlarsen): Make these work with generic vectorization if possible.
|
||||
// Use the SSE optimized version whenever possible.
|
||||
#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
|
||||
#include "src/Geometry/arch/Geometry_SIMD.h"
|
||||
#endif
|
||||
#endif
|
||||
// IWYU pragma: end_exports
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
3
Eigen/LU
3
Eigen/LU
@@ -36,9 +36,12 @@
|
||||
#include "src/LU/Determinant.h"
|
||||
#include "src/LU/InverseImpl.h"
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_GENERIC
|
||||
// TODO(rmlarsen): Make these work with generic vectorization if possible.
|
||||
#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/LU/arch/InverseSize4.h"
|
||||
#endif
|
||||
#endif
|
||||
// IWYU pragma: end_exports
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
@@ -1224,7 +1224,7 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Pac
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline std::conditional_t<(unpacket_traits<Packet>::size % 8) == 0,
|
||||
typename unpacket_traits<Packet>::half, Packet>
|
||||
predux_half_dowto4(const Packet& a) {
|
||||
predux_half(const Packet& a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
@@ -1342,7 +1342,7 @@ struct pmadd_impl {
|
||||
return psub(c, pmul(a, b));
|
||||
}
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
|
||||
return pnegate(pmadd(a, b, c));
|
||||
return pnegate(padd(pmul(a, b), c));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1476,8 +1476,8 @@ struct PacketBlock {
|
||||
Packet packet[N];
|
||||
};
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet, 1>& /*kernel*/) {
|
||||
template <typename Packet, int size = 1>
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet, size>& /*kernel*/) {
|
||||
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
|
||||
}
|
||||
|
||||
|
||||
@@ -1934,15 +1934,15 @@ EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Pack
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
|
||||
EIGEN_STRONG_INLINE Packet4f predux_half<Packet8f>(const Packet8f& a) {
|
||||
return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
|
||||
EIGEN_STRONG_INLINE Packet4i predux_half<Packet8i>(const Packet8i& a) {
|
||||
return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
|
||||
EIGEN_STRONG_INLINE Packet4ui predux_half<Packet8ui>(const Packet8ui& a) {
|
||||
return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
}
|
||||
|
||||
|
||||
@@ -1498,7 +1498,7 @@ EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d&
|
||||
#endif
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
|
||||
EIGEN_STRONG_INLINE Packet8f predux_half<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
@@ -1514,13 +1514,13 @@ EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
|
||||
EIGEN_STRONG_INLINE Packet4d predux_half<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
return _mm256_add_pd(lane0, lane1);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
|
||||
EIGEN_STRONG_INLINE Packet8i predux_half<Packet16i>(const Packet16i& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
__m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
|
||||
__m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
|
||||
@@ -1537,7 +1537,7 @@ EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
|
||||
EIGEN_STRONG_INLINE Packet4l predux_half<Packet8l>(const Packet8l& a) {
|
||||
__m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
|
||||
__m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
|
||||
return _mm256_add_epi64(lane0, lane1);
|
||||
@@ -2285,7 +2285,7 @@ EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
|
||||
EIGEN_STRONG_INLINE Packet8h predux_half<Packet16h>(const Packet16h& a) {
|
||||
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
|
||||
Packet8h lane1 = _mm256_extractf128_si256(a, 1);
|
||||
return padd<Packet8h>(lane0, lane1);
|
||||
@@ -2791,7 +2791,7 @@ EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
|
||||
EIGEN_STRONG_INLINE Packet8bf predux_half<Packet16bf>(const Packet16bf& a) {
|
||||
Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
|
||||
Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
|
||||
return padd<Packet8bf>(lane0, lane1);
|
||||
|
||||
@@ -695,7 +695,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
|
||||
Packet x = _x;
|
||||
const Packet cst_zero = pset1<Packet>(0.0);
|
||||
const Packet cst_1 = pset1<Packet>(1.0);
|
||||
const Packet cst_2 = pset1<Packet>(2.0);
|
||||
@@ -719,7 +718,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
|
||||
|
||||
// clamp x
|
||||
Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
|
||||
x = pmin(x, cst_exp_hi);
|
||||
Packet x = pmin(_x, cst_exp_hi);
|
||||
|
||||
// Express exp(x) as exp(g + n*log(2)).
|
||||
fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
|
||||
|
||||
@@ -1352,7 +1352,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x)
|
||||
const T minus_clamp = pset1<T>(-17.714196154005176);
|
||||
#endif
|
||||
const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
|
||||
|
||||
// The following rational approximation was generated by rminimax
|
||||
// (https://gitlab.inria.fr/sfilip/rminimax) using the following
|
||||
// command:
|
||||
|
||||
@@ -3500,27 +3500,27 @@ EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
|
||||
#endif
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half(const Packet8c& a) {
|
||||
return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half(const Packet16c& a) {
|
||||
return vadd_s8(vget_high_s8(a), vget_low_s8(a));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half(const Packet8uc& a) {
|
||||
return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half(const Packet16uc& a) {
|
||||
return vadd_u8(vget_high_u8(a), vget_low_u8(a));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half(const Packet8s& a) {
|
||||
return vadd_s16(vget_high_s16(a), vget_low_s16(a));
|
||||
}
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half(const Packet8us& a) {
|
||||
return vadd_u16(vget_high_u16(a), vget_low_u16(a));
|
||||
}
|
||||
|
||||
@@ -5401,7 +5401,7 @@ struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
|
||||
};
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half<Packet8hf>(const Packet8hf& a) {
|
||||
return vadd_f16(vget_low_f16(a), vget_high_f16(a));
|
||||
}
|
||||
|
||||
|
||||
47
Eigen/src/Core/arch/clang/MathFunctions.h
Normal file
47
Eigen/src/Core/arch/clang/MathFunctions.h
Normal file
@@ -0,0 +1,47 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2025 Rasmus Munk Larsen
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_CLANG_H
|
||||
#define EIGEN_MATH_FUNCTIONS_CLANG_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "../../InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
|
||||
return pfrexp_generic(a, exponent);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
|
||||
return pfrexp_generic(a, exponent);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
|
||||
return pldexp_generic(a, exponent);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
|
||||
return pldexp_generic(a, exponent);
|
||||
}
|
||||
|
||||
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f)
|
||||
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d)
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CLANG_H
|
||||
869
Eigen/src/Core/arch/clang/PacketMath.h
Normal file
869
Eigen/src/Core/arch/clang/PacketMath.h
Normal file
@@ -0,0 +1,869 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2025 Rasmus Munk Larsen
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_CLANG_H
|
||||
#define EIGEN_PACKET_MATH_CLANG_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
namespace detail {
|
||||
// namespace detail contains implementation details specific to this
|
||||
// file, while namespace internal contains internal APIs used elsewhere
|
||||
// in Eigen.
|
||||
template <typename ScalarT, int n>
|
||||
using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT))));
|
||||
} // namespace detail
|
||||
|
||||
// --- Primary packet type definitions (fixed at 64 bytes) ---
|
||||
|
||||
// TODO(rmlarsen): Generalize to other vector sizes.
|
||||
static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, "We currently assume the full vector size is 64 bytes");
|
||||
using Packet16f = detail::VectorType<float, 16>;
|
||||
using Packet8d = detail::VectorType<double, 8>;
|
||||
using Packet16i = detail::VectorType<int32_t, 16>;
|
||||
using Packet8l = detail::VectorType<int64_t, 8>;
|
||||
|
||||
// --- packet_traits specializations ---
|
||||
template <>
|
||||
struct packet_traits<float> : default_packet_traits {
|
||||
using type = Packet16f;
|
||||
using half = Packet16f;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
size = 16,
|
||||
AlignedOnScalar = 1,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasRound = 1,
|
||||
HasMinMax = 1,
|
||||
HasCmp = 1,
|
||||
HasSet1 = 1,
|
||||
HasCast = 1,
|
||||
HasBitwise = 1,
|
||||
HasRedux = 1,
|
||||
HasSign = 1,
|
||||
HasArg = 0,
|
||||
HasConj = 1,
|
||||
// Math functions
|
||||
HasReciprocal = 1,
|
||||
HasSin = 1,
|
||||
HasCos = 1,
|
||||
HasACos = 1,
|
||||
HasASin = 1,
|
||||
HasATan = 1,
|
||||
HasATanh = 1,
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
HasExpm1 = 1,
|
||||
HasExp = 1,
|
||||
HasPow = 1,
|
||||
HasNdtri = 1,
|
||||
HasBessel = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasCbrt = 1,
|
||||
HasTanh = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packet_traits<double> : default_packet_traits {
|
||||
using type = Packet8d;
|
||||
using half = Packet8d;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
size = 8,
|
||||
AlignedOnScalar = 1,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasRound = 1,
|
||||
HasMinMax = 1,
|
||||
HasCmp = 1,
|
||||
HasSet1 = 1,
|
||||
HasCast = 1,
|
||||
HasBitwise = 1,
|
||||
HasRedux = 1,
|
||||
HasSign = 1,
|
||||
HasArg = 0,
|
||||
HasConj = 1,
|
||||
// Math functions
|
||||
HasReciprocal = 1,
|
||||
HasSin = 1,
|
||||
HasCos = 1,
|
||||
HasACos = 0,
|
||||
HasASin = 0,
|
||||
HasATan = 1,
|
||||
HasATanh = 1,
|
||||
HasLog = 1,
|
||||
HasLog1p = 1,
|
||||
HasExpm1 = 1,
|
||||
HasExp = 1,
|
||||
HasPow = 1,
|
||||
HasNdtri = 1,
|
||||
HasBessel = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasCbrt = 1,
|
||||
HasTanh = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packet_traits<int32_t> : default_packet_traits {
|
||||
using type = Packet16i;
|
||||
using half = Packet16i;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
size = 16,
|
||||
AlignedOnScalar = 1,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasMinMax = 1,
|
||||
HasCmp = 1,
|
||||
HasSet1 = 1,
|
||||
HasCast = 1,
|
||||
HasBitwise = 1,
|
||||
HasRedux = 1,
|
||||
// Set remaining to 0
|
||||
HasRound = 1,
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasReciprocal = 0,
|
||||
HasArg = 0,
|
||||
HasConj = 1,
|
||||
HasExp = 0,
|
||||
HasLog = 0,
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packet_traits<int64_t> : default_packet_traits {
|
||||
using type = Packet8l;
|
||||
using half = Packet8l;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
size = 8,
|
||||
AlignedOnScalar = 1,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 1,
|
||||
HasMinMax = 1,
|
||||
HasCmp = 1,
|
||||
HasSet1 = 1,
|
||||
HasCast = 1,
|
||||
HasBitwise = 1,
|
||||
HasRedux = 1,
|
||||
// Set remaining to 0
|
||||
HasRound = 1,
|
||||
HasSqrt = 0,
|
||||
HasRsqrt = 0,
|
||||
HasReciprocal = 0,
|
||||
HasArg = 0,
|
||||
HasConj = 1,
|
||||
HasExp = 0,
|
||||
HasLog = 0,
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
};
|
||||
};
|
||||
|
||||
// --- unpacket_traits specializations ---
|
||||
template <>
|
||||
struct unpacket_traits<Packet16f> {
|
||||
using type = float;
|
||||
using half = Packet16f;
|
||||
using integer_packet = Packet16i;
|
||||
enum {
|
||||
size = 16,
|
||||
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
|
||||
vectorizable = true,
|
||||
masked_load_available = false,
|
||||
masked_store_available = false
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct unpacket_traits<Packet8d> {
|
||||
using type = double;
|
||||
using half = Packet8d;
|
||||
using integer_packet = Packet8l;
|
||||
enum {
|
||||
size = 8,
|
||||
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
|
||||
vectorizable = true,
|
||||
masked_load_available = false,
|
||||
masked_store_available = false
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct unpacket_traits<Packet16i> {
|
||||
using type = int32_t;
|
||||
using half = Packet16i;
|
||||
enum {
|
||||
size = 16,
|
||||
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
|
||||
vectorizable = true,
|
||||
masked_load_available = false,
|
||||
masked_store_available = false
|
||||
};
|
||||
};
|
||||
template <>
|
||||
struct unpacket_traits<Packet8l> {
|
||||
using type = int64_t;
|
||||
using half = Packet8l;
|
||||
enum {
|
||||
size = 8,
|
||||
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
|
||||
vectorizable = true,
|
||||
masked_load_available = false,
|
||||
masked_store_available = false
|
||||
};
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
// --- vector type helpers ---
|
||||
template <typename VectorT>
|
||||
struct ScalarTypeOfVector {
|
||||
using type = std::remove_all_extents_t<std::remove_reference_t<decltype(VectorT()[0])>>;
|
||||
};
|
||||
|
||||
template <typename VectorT>
|
||||
using scalar_type_of_vector_t = typename ScalarTypeOfVector<VectorT>::type;
|
||||
|
||||
template <typename VectorType>
|
||||
struct UnsignedVectorHelpter {
|
||||
static VectorType v;
|
||||
static constexpr int n = __builtin_vectorelements(v);
|
||||
using UnsignedScalar = std::make_unsigned_t<scalar_type_of_vector_t<VectorType>>;
|
||||
using type = UnsignedScalar __attribute__((ext_vector_type(n), aligned(n * sizeof(UnsignedScalar))));
|
||||
};
|
||||
|
||||
template <typename VectorT>
|
||||
using unsigned_vector_t = typename UnsignedVectorHelpter<VectorT>::type;
|
||||
|
||||
template <typename VectorT>
|
||||
using HalfPacket = VectorType<typename unpacket_traits<VectorT>::type, unpacket_traits<VectorT>::size / 2>;
|
||||
|
||||
template <typename VectorT>
|
||||
using QuarterPacket = VectorType<typename unpacket_traits<VectorT>::type, unpacket_traits<VectorT>::size / 4>;
|
||||
|
||||
// load and store helpers.
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE VectorT load_vector_unaligned(const scalar_type_of_vector_t<VectorT>* from) {
|
||||
VectorT to;
|
||||
constexpr int n = __builtin_vectorelements(to);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
to[i] = from[i];
|
||||
}
|
||||
return to;
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE VectorT load_vector_aligned(const scalar_type_of_vector_t<VectorT>* from) {
|
||||
return *reinterpret_cast<const VectorT*>(assume_aligned<EIGEN_GENERIC_VECTOR_SIZE_BYTES>(from));
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE void store_vector_unaligned(scalar_type_of_vector_t<VectorT>* to, const VectorT& from) {
|
||||
constexpr int n = __builtin_vectorelements(from);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
*to++ = from[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t<VectorT>* to, const VectorT& from) {
|
||||
*reinterpret_cast<VectorT*>(assume_aligned<EIGEN_GENERIC_VECTOR_SIZE_BYTES>(to)) = from;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// --- Intrinsic-like specializations ---
|
||||
|
||||
// --- Load/Store operations ---
|
||||
#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE, SCALAR_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const SCALAR_TYPE* from) { \
|
||||
return detail::load_vector_unaligned<PACKET_TYPE>(from); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const SCALAR_TYPE* from) { \
|
||||
return detail::load_vector_aligned<PACKET_TYPE>(from); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE void pstoreu<SCALAR_TYPE, PACKET_TYPE>(SCALAR_TYPE * to, const PACKET_TYPE& from) { \
|
||||
detail::store_vector_unaligned<PACKET_TYPE>(to, from); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE void pstore<SCALAR_TYPE, PACKET_TYPE>(SCALAR_TYPE * to, const PACKET_TYPE& from) { \
|
||||
detail::store_vector_aligned<PACKET_TYPE>(to, from); \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f, float)
|
||||
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d, double)
|
||||
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i, int32_t)
|
||||
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l, int64_t)
|
||||
#undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET
|
||||
|
||||
// --- Broadcast operation ---
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(uint32_t from) {
|
||||
return Packet16f(numext::bit_cast<float>(from));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
|
||||
return Packet8d(numext::bit_cast<double>(from));
|
||||
}
|
||||
|
||||
#define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pset1<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type& from) { \
|
||||
return PACKET_TYPE(from); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type pfirst<PACKET_TYPE>(const PACKET_TYPE& from) { \
|
||||
return from[0]; \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_SET1(Packet16f)
|
||||
EIGEN_CLANG_PACKET_SET1(Packet8d)
|
||||
EIGEN_CLANG_PACKET_SET1(Packet16i)
|
||||
EIGEN_CLANG_PACKET_SET1(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_SET1
|
||||
|
||||
// --- Arithmetic operations ---
|
||||
#define EIGEN_CLANG_PACKET_ARITHMETIC(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pisnan<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return reinterpret_cast<PACKET_TYPE>(a != a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pnegate<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return -a; \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16f)
|
||||
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8d)
|
||||
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16i)
|
||||
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_ARITHMETIC
|
||||
|
||||
// --- Bitwise operations (via casting) ---
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Note: pcast functions are not template specializations, just helpers
|
||||
// identical to preinterpret. We duplicate them here to avoid a circular
|
||||
// dependence with TypeCasting.h.
|
||||
EIGEN_STRONG_INLINE Packet16i pcast_float_to_int(const Packet16f& a) { return reinterpret_cast<Packet16i>(a); }
|
||||
EIGEN_STRONG_INLINE Packet16f pcast_int_to_float(const Packet16i& a) { return reinterpret_cast<Packet16f>(a); }
|
||||
EIGEN_STRONG_INLINE Packet8l pcast_double_to_long(const Packet8d& a) { return reinterpret_cast<Packet8l>(a); }
|
||||
EIGEN_STRONG_INLINE Packet8d pcast_long_to_double(const Packet8l& a) { return reinterpret_cast<Packet8d>(a); }
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Bitwise ops for integer packets
|
||||
#define EIGEN_CLANG_PACKET_BITWISE_INT(PACKET_TYPE) \
|
||||
template <> \
|
||||
constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) { \
|
||||
return PACKET_TYPE(0) == PACKET_TYPE(0); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pand<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a & b; \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE por<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a | b; \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pxor<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a ^ b; \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pandnot<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a & ~b; \
|
||||
} \
|
||||
template <int N> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE parithmetic_shift_right(const PACKET_TYPE& a) { \
|
||||
return a >> N; \
|
||||
} \
|
||||
template <int N> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_right(const PACKET_TYPE& a) { \
|
||||
using UnsignedT = detail::unsigned_vector_t<PACKET_TYPE>; \
|
||||
return reinterpret_cast<PACKET_TYPE>(reinterpret_cast<UnsignedT>(a) >> N); \
|
||||
} \
|
||||
template <int N> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE plogical_shift_left(const PACKET_TYPE& a) { \
|
||||
return a << N; \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_BITWISE_INT(Packet16i)
|
||||
EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_BITWISE_INT
|
||||
|
||||
// Bitwise ops for floating point packets
|
||||
#define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return CAST_FROM_INT(CAST_TO_INT(a) == CAST_TO_INT(a)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pand<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return CAST_FROM_INT(CAST_TO_INT(a) & CAST_TO_INT(b)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE por<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return CAST_FROM_INT(CAST_TO_INT(a) | CAST_TO_INT(b)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pxor<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return CAST_FROM_INT(CAST_TO_INT(a) ^ CAST_TO_INT(b)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pandnot<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b)); \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::pcast_float_to_int, detail::pcast_int_to_float)
|
||||
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::pcast_double_to_long, detail::pcast_long_to_double)
|
||||
#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT
|
||||
|
||||
// --- Min/Max operations ---
|
||||
#if __has_builtin(__builtin_elementwise_min) && __has_builtin(__builtin_elementwise_max) && \
|
||||
__has_builtin(__builtin_elementwise_abs)
|
||||
#define EIGEN_CLANG_PACKET_ELEMENTWISE(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmin<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
/* Match NaN propagation of std::min. */ \
|
||||
return a == a ? __builtin_elementwise_min(a, b) : a; \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmax<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
/* Match NaN propagation of std::max. */ \
|
||||
return a == a ? __builtin_elementwise_max(a, b) : a; \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmin<PropagateNumbers, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return __builtin_elementwise_min(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmax<PropagateNumbers, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return __builtin_elementwise_max(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmin<PropagateNaN, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a != a ? a : (b != b ? b : __builtin_elementwise_min(a, b)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmax<PropagateNaN, PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
|
||||
return a != a ? a : (b != b ? b : __builtin_elementwise_max(a, b)); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pabs<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_abs(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pselect<PACKET_TYPE>(const PACKET_TYPE& mask, const PACKET_TYPE& a, \
|
||||
const PACKET_TYPE& b) { \
|
||||
return __builtin_elementwise_abs(mask) == 0 ? b : a; \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f)
|
||||
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8d)
|
||||
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16i)
|
||||
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_ELEMENTWISE
|
||||
#endif
|
||||
|
||||
// --- Math functions (float/double only) ---
|
||||
|
||||
#if __has_builtin(__builtin_elementwise_floor) && __has_builtin(__builtin_elementwise_ceil) && \
|
||||
__has_builtin(__builtin_elementwise_round) && __has_builtin(__builtin_elementwise_roundeven) && \
|
||||
__has_builtin(__builtin_elementwise_trunc) && __has_builtin(__builtin_elementwise_sqrt)
|
||||
#define EIGEN_CLANG_PACKET_MATH_FLOAT(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pfloor<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_floor(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pceil<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_ceil(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pround<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_round(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE print<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_roundeven(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE ptrunc<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_trunc(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE psqrt<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_elementwise_sqrt(a); \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet16f)
|
||||
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
|
||||
#undef EIGEN_CLANG_PACKET_MATH_FLOAT
|
||||
#endif
|
||||
|
||||
// --- Fused Multiply-Add (MADD) ---
|
||||
#if defined(EIGEN_VECTORIZE_FMA) && __has_builtin(__builtin_elementwise_fma)
|
||||
#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
|
||||
const PACKET_TYPE& c) { \
|
||||
return __builtin_elementwise_fma(a, b, c); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
|
||||
const PACKET_TYPE& c) { \
|
||||
return __builtin_elementwise_fma(a, b, -c); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pnmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
|
||||
const PACKET_TYPE& c) { \
|
||||
return __builtin_elementwise_fma(-a, b, c); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pnmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
|
||||
const PACKET_TYPE& c) { \
|
||||
return -(__builtin_elementwise_fma(a, b, c)); \
|
||||
}
|
||||
#else
|
||||
// Fallback if FMA builtin is not available
|
||||
#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
|
||||
const PACKET_TYPE& c) { \
|
||||
return (a * b) + c; \
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_CLANG_PACKET_MADD(Packet16f)
|
||||
EIGEN_CLANG_PACKET_MADD(Packet8d)
|
||||
#undef EIGEN_CLANG_PACKET_MADD
|
||||
|
||||
#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE void pscatter(unpacket_traits<PACKET_TYPE>::type* to, const PACKET_TYPE& from, Index stride) { \
|
||||
constexpr int size = unpacket_traits<PACKET_TYPE>::size; \
|
||||
for (int i = 0; i < size; ++i) { \
|
||||
to[i * stride] = from[i]; \
|
||||
} \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE pgather<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>( \
|
||||
const unpacket_traits<PACKET_TYPE>::type* from, Index stride) { \
|
||||
constexpr int size = unpacket_traits<PACKET_TYPE>::size; \
|
||||
unpacket_traits<PACKET_TYPE>::type arr[size]; \
|
||||
for (int i = 0; i < size; ++i) { \
|
||||
arr[i] = from[i * stride]; \
|
||||
} \
|
||||
return *reinterpret_cast<PACKET_TYPE*>(arr); \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f)
|
||||
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d)
|
||||
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i)
|
||||
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
|
||||
|
||||
// ---- Various operations that depend on __builtin_shufflevector.
|
||||
#if __has_builtin(__builtin_shufflevector)
|
||||
namespace detail {
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) {
|
||||
return __builtin_shufflevector(a, a, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet preverse_impl_16(const Packet& a) {
|
||||
return __builtin_shufflevector(a, a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
#define EIGEN_CLANG_PACKET_REVERSE(PACKET_TYPE, SIZE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE PACKET_TYPE preverse<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return detail::preverse_impl_##SIZE(a); \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_REVERSE(Packet16f, 16)
|
||||
EIGEN_CLANG_PACKET_REVERSE(Packet8d, 8)
|
||||
EIGEN_CLANG_PACKET_REVERSE(Packet16i, 16)
|
||||
EIGEN_CLANG_PACKET_REVERSE(Packet8l, 8)
|
||||
#undef EIGEN_CLANG_PACKET_REVERSE
|
||||
|
||||
namespace detail {
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
|
||||
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
|
||||
using HalfPacket = HalfPacket<Packet>;
|
||||
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
|
||||
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
|
||||
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
|
||||
using QuarterPacket = QuarterPacket<Packet>;
|
||||
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
|
||||
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits<Packet>::type* from) {
|
||||
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
|
||||
using HalfPacket = HalfPacket<Packet>;
|
||||
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
|
||||
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::type* from) {
|
||||
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
|
||||
using QuarterPacket = QuarterPacket<Packet>;
|
||||
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
|
||||
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
|
||||
return detail::ploaddup16<Packet16f>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
return detail::ploaddup8<Packet8d>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int32_t* from) {
|
||||
return detail::ploaddup16<Packet16i>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
|
||||
return detail::ploaddup8<Packet8l>(from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
||||
return detail::ploadquad16<Packet16f>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
|
||||
return detail::ploadquad8<Packet8d>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int32_t* from) {
|
||||
return detail::ploadquad16<Packet16i>(from);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
|
||||
return detail::ploadquad8<Packet8l>(from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
|
||||
Packet16f x{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f,
|
||||
a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
|
||||
return x;
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
|
||||
return Packet8d{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int32_t& a) {
|
||||
return Packet16i{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7,
|
||||
a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
|
||||
return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
|
||||
}
|
||||
|
||||
// Helpers for ptranspose.
|
||||
namespace detail {
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
|
||||
Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||
p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||
p1 = tmp;
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) {
|
||||
Packet tmp = __builtin_shufflevector(p1, p2, 0, 8, 1, 9, 2, 10, 3, 11);
|
||||
p2 = __builtin_shufflevector(p1, p2, 4, 12, 5, 13, 6, 14, 7, 15);
|
||||
p1 = tmp;
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
void zip_in_place(Packet& p1, Packet& p2);
|
||||
|
||||
template <>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16f>(Packet16f& p1, Packet16f& p2) {
|
||||
zip_in_place16(p1, p2);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8d>(Packet8d& p1, Packet8d& p2) {
|
||||
zip_in_place8(p1, p2);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16i>(Packet16i& p1, Packet16i& p2) {
|
||||
zip_in_place16(p1, p2);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8l>(Packet8l& p1, Packet8l& p2) {
|
||||
zip_in_place8(p1, p2);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
|
||||
zip_in_place(kernel.packet[0], kernel.packet[1]);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
|
||||
zip_in_place(kernel.packet[0], kernel.packet[2]);
|
||||
zip_in_place(kernel.packet[1], kernel.packet[3]);
|
||||
zip_in_place(kernel.packet[0], kernel.packet[1]);
|
||||
zip_in_place(kernel.packet[2], kernel.packet[3]);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
|
||||
zip_in_place(kernel.packet[0], kernel.packet[4]);
|
||||
zip_in_place(kernel.packet[1], kernel.packet[5]);
|
||||
zip_in_place(kernel.packet[2], kernel.packet[6]);
|
||||
zip_in_place(kernel.packet[3], kernel.packet[7]);
|
||||
|
||||
zip_in_place(kernel.packet[0], kernel.packet[2]);
|
||||
zip_in_place(kernel.packet[1], kernel.packet[3]);
|
||||
zip_in_place(kernel.packet[4], kernel.packet[6]);
|
||||
zip_in_place(kernel.packet[5], kernel.packet[7]);
|
||||
|
||||
zip_in_place(kernel.packet[0], kernel.packet[1]);
|
||||
zip_in_place(kernel.packet[2], kernel.packet[3]);
|
||||
zip_in_place(kernel.packet[4], kernel.packet[5]);
|
||||
zip_in_place(kernel.packet[6], kernel.packet[7]);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
|
||||
EIGEN_UNROLL_LOOP
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
const int m = (1 << i);
|
||||
EIGEN_UNROLL_LOOP
|
||||
for (int j = 0; j < m; ++j) {
|
||||
const int n = (1 << (3 - i));
|
||||
EIGEN_UNROLL_LOOP
|
||||
for (int k = 0; k < n; ++k) {
|
||||
const int idx = 2 * j * n + k;
|
||||
zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 2>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 2>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 8>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 2>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 2>& kernel) {
|
||||
detail::ptranspose_impl(kernel);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_CLANG_H
|
||||
112
Eigen/src/Core/arch/clang/Reductions.h
Normal file
112
Eigen/src/Core/arch/clang/Reductions.h
Normal file
@@ -0,0 +1,112 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2025 Rasmus Munk Larsen
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_REDUCTIONS_CLANG_H
|
||||
#define EIGEN_REDUCTIONS_CLANG_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// --- Reductions ---
|
||||
#if __has_builtin(__builtin_reduce_min) && __has_builtin(__builtin_reduce_max) && __has_builtin(__builtin_reduce_or)
|
||||
#define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_min(const PACKET_TYPE& a) { \
|
||||
return __builtin_reduce_min(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_max(const PACKET_TYPE& a) { \
|
||||
return __builtin_reduce_max(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE bool predux_any(const PACKET_TYPE& a) { \
|
||||
return __builtin_reduce_or(a != 0) != 0; \
|
||||
}
|
||||
|
||||
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16f)
|
||||
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8d)
|
||||
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16i)
|
||||
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_REDUX_MINMAX
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_reduce_add) && __has_builtin(__builtin_reduce_mul)
|
||||
#define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE) \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_reduce_add(a); \
|
||||
} \
|
||||
template <> \
|
||||
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_mul<PACKET_TYPE>(const PACKET_TYPE& a) { \
|
||||
return __builtin_reduce_mul(a); \
|
||||
}
|
||||
|
||||
// __builtin_reduce_{mul,add} are only defined for integer types.
|
||||
EIGEN_CLANG_PACKET_REDUX_INT(Packet16i)
|
||||
EIGEN_CLANG_PACKET_REDUX_INT(Packet8l)
|
||||
#undef EIGEN_CLANG_PACKET_REDUX_INT
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_shufflevector)
|
||||
namespace detail {
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceAdd16(const VectorT& a) {
|
||||
auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) +
|
||||
__builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15);
|
||||
auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) + __builtin_shufflevector(t1, t1, 1, 3, 5, 7);
|
||||
auto t3 = __builtin_shufflevector(t2, t2, 0, 2) + __builtin_shufflevector(t2, t2, 1, 3);
|
||||
return t3[0] + t3[1];
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceAdd8(const VectorT& a) {
|
||||
auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) + __builtin_shufflevector(a, a, 1, 3, 5, 7);
|
||||
auto t2 = __builtin_shufflevector(t1, t1, 0, 2) + __builtin_shufflevector(t1, t1, 1, 3);
|
||||
return t2[0] + t2[1];
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceMul16(const VectorT& a) {
|
||||
auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) *
|
||||
__builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15);
|
||||
auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) * __builtin_shufflevector(t1, t1, 1, 3, 5, 7);
|
||||
auto t3 = __builtin_shufflevector(t2, t2, 0, 2) * __builtin_shufflevector(t2, t2, 1, 3);
|
||||
return t3[0] * t3[1];
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceMul8(const VectorT& a) {
|
||||
auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) * __builtin_shufflevector(a, a, 1, 3, 5, 7);
|
||||
auto t2 = __builtin_shufflevector(t1, t1, 0, 2) * __builtin_shufflevector(t1, t1, 1, 3);
|
||||
return t2[0] * t2[1];
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
|
||||
return detail::ReduceAdd16(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
||||
return detail::ReduceAdd8(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
|
||||
return detail::ReduceMul16(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
|
||||
return detail::ReduceMul8(a);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_REDUCTIONS_CLANG_H
|
||||
63
Eigen/src/Core/arch/clang/TypeCasting.h
Normal file
63
Eigen/src/Core/arch/clang/TypeCasting.h
Normal file
@@ -0,0 +1,63 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2025 Rasmus Munk Larsen
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_CLANG_H
|
||||
#define EIGEN_TYPE_CASTING_CLANG_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
//==============================================================================
|
||||
// preinterpret
|
||||
//==============================================================================
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
|
||||
return reinterpret_cast<Packet16f>(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
|
||||
return reinterpret_cast<Packet16i>(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet8l>(const Packet8l& a) {
|
||||
return reinterpret_cast<Packet8d>(a);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8l preinterpret<Packet8l, Packet8d>(const Packet8d& a) {
|
||||
return reinterpret_cast<Packet8l>(a);
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// pcast
|
||||
//==============================================================================
|
||||
#if __has_builtin(__builtin_convertvector)
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
|
||||
return __builtin_convertvector(a, Packet16i);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
|
||||
return __builtin_convertvector(a, Packet16f);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8l pcast<Packet8d, Packet8l>(const Packet8d& a) {
|
||||
return __builtin_convertvector(a, Packet8l);
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pcast<Packet8l, Packet8d>(const Packet8l& a) {
|
||||
return __builtin_convertvector(a, Packet8d);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_CLANG_H
|
||||
@@ -642,25 +642,21 @@ DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Pack
|
||||
return res;
|
||||
}
|
||||
|
||||
// note that for DoublePacket<RealPacket> the "4" in "downto4"
|
||||
// corresponds to the number of complexes, so it means "8"
|
||||
// it terms of real coefficients.
|
||||
|
||||
template <typename Packet>
|
||||
const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
|
||||
std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
|
||||
const DoublePacket<Packet>& predux_half(const DoublePacket<Packet>& a,
|
||||
std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
|
||||
DoublePacket<typename unpacket_traits<Packet>::half> predux_half(
|
||||
const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
|
||||
// yes, that's pretty hackish :(
|
||||
DoublePacket<typename unpacket_traits<Packet>::half> res;
|
||||
typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
|
||||
typedef typename packet_traits<Cplx>::type CplxPacket;
|
||||
res.first = predux_half_dowto4(CplxPacket(a.first)).v;
|
||||
res.second = predux_half_dowto4(CplxPacket(a.second)).v;
|
||||
res.first = predux_half(CplxPacket(a.first)).v;
|
||||
res.second = predux_half(CplxPacket(a.second)).v;
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -1067,7 +1063,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
|
||||
if (depth - endk > 0) {
|
||||
// We have to handle the last row(s) of the rhs, which
|
||||
// correspond to a half-packet
|
||||
SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
|
||||
SAccPacketQuarter c0 = predux_half(predux_half(C0));
|
||||
|
||||
for (Index kk = endk; kk < depth; kk++) {
|
||||
SLhsPacketQuarter a0;
|
||||
@@ -1080,7 +1076,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
|
||||
}
|
||||
straits.acc(c0, alphav, R);
|
||||
} else {
|
||||
straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
|
||||
straits.acc(predux_half(predux_half(C0)), alphav, R);
|
||||
}
|
||||
res.scatterPacket(i, j2, R);
|
||||
}
|
||||
@@ -2473,11 +2469,11 @@ EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr,
|
||||
SRhsPacketHalf b0;
|
||||
straits.loadLhsUnaligned(blB, a0);
|
||||
straits.loadRhs(blA, b0);
|
||||
SAccPacketHalf c0 = predux_half_dowto4(C0);
|
||||
SAccPacketHalf c0 = predux_half(C0);
|
||||
straits.madd(a0, b0, c0, b0, fix<0>);
|
||||
straits.acc(c0, alphav, R);
|
||||
} else {
|
||||
straits.acc(predux_half_dowto4(C0), alphav, R);
|
||||
straits.acc(predux_half(C0), alphav, R);
|
||||
}
|
||||
res.scatterPacket(i, j2, R);
|
||||
} else if (SwappedTraits::LhsProgress == 16) {
|
||||
|
||||
@@ -11,6 +11,18 @@
|
||||
#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
|
||||
#define EIGEN_CONFIGURE_VECTORIZATION_H
|
||||
|
||||
// Prepare for using the generic clang backend if requested.
|
||||
#if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE) && !defined(EIGEN_DONT_ALIGN)
|
||||
#if !EIGEN_ARCH_VECTOR_EXTENSIONS
|
||||
#error "The compiler does not support clang vector extensions."
|
||||
#endif
|
||||
#define EIGEN_VECTORIZE
|
||||
#ifndef EIGEN_GENERIC_VECTOR_SIZE_BYTES
|
||||
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64
|
||||
#endif
|
||||
#define EIGEN_MAX_ALIGN_BYTES EIGEN_GENERIC_VECTOR_SIZE_BYTES
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
// Static and dynamic alignment control
|
||||
//
|
||||
@@ -504,7 +516,7 @@ extern "C" {
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
inline static const char *SimdInstructionSetsInUse(void) {
|
||||
inline static const char* SimdInstructionSetsInUse(void) {
|
||||
#if defined(EIGEN_VECTORIZE_AVX512)
|
||||
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_AVX)
|
||||
|
||||
@@ -830,6 +830,13 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Does the compiler support vector types?
|
||||
#if __has_attribute(ext_vector_type) && __has_builtin(__builtin_vectorelements)
|
||||
#define EIGEN_ARCH_VECTOR_EXTENSIONS 1
|
||||
#else
|
||||
#define EIGEN_ARCH_VECTOR_EXTENSIONS 0
|
||||
#endif
|
||||
|
||||
// Multidimensional subscript operator feature test
|
||||
#if defined(__cpp_multidimensional_subscript) && __cpp_multidimensional_subscript >= 202110L
|
||||
#define EIGEN_MULTIDIMENSIONAL_SUBSCRIPT
|
||||
|
||||
@@ -685,8 +685,8 @@ void packetmath() {
|
||||
int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize;
|
||||
for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0);
|
||||
for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i];
|
||||
internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1)));
|
||||
VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
|
||||
internal::pstore(data2, internal::predux_half(internal::pload<Packet>(data1)));
|
||||
VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half");
|
||||
}
|
||||
|
||||
// Avoid overflows.
|
||||
|
||||
@@ -77,6 +77,8 @@ namespace Eigen {
|
||||
#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
|
||||
#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_GENERIC
|
||||
// TODO(rmlarsen): Make these work with generic vectorization if possible.
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
|
||||
#include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
|
||||
@@ -93,6 +95,7 @@ namespace Eigen {
|
||||
#if defined EIGEN_VECTORIZE_GPU
|
||||
#include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
|
||||
#endif
|
||||
#endif
|
||||
// IWYU pragma: end_exports
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
Reference in New Issue
Block a user