diff --git a/Eigen/Core b/Eigen/Core index 59f12eb54..5f46dde21 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -208,6 +208,7 @@ using std::ptrdiff_t; #if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE) #include "src/Core/arch/clang/PacketMath.h" #include "src/Core/arch/clang/TypeCasting.h" +#include "src/Core/arch/clang/Complex.h" #include "src/Core/arch/clang/Reductions.h" #include "src/Core/arch/clang/MathFunctions.h" #else diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ba38cd1fe..dc3e03d21 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -1010,12 +1010,26 @@ EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } -/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ +/** \internal \returns \a a with real and imaginary parts flipped (for complex types only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { return Packet(numext::imag(a), numext::real(a)); } +/** \internal \returns \a a with real part duplicated (for complex types only) */ +// TODO(rmlarsen): Define and use in all complex backends. +template +EIGEN_DEVICE_FUNC inline Packet pdupreal(const Packet& a) { + return Packet(numext::real(a), numext::real(a)); +} + +/** \internal \returns \a a with imaginary part duplicated (for complex types only) */ +// TODO(rmlarsen): Define and use in all complex backends. +template +EIGEN_DEVICE_FUNC inline Packet pdupimag(const Packet& a) { + return Packet(numext::imag(a), numext::imag(a)); +} + /************************** * Special math functions ***************************/ diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 827386fb0..96d278386 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -1130,6 +1130,20 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Pac return psincos_double(x); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS + std::enable_if_t::type, float>::value, Packet> + psincos_selector(const Packet& x) { + return psincos_float(x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS + std::enable_if_t::type, double>::value, Packet> + psincos_selector(const Packet& x) { + return psincos_double(x); +} + // Generic implementation of acos(x). template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) { @@ -1469,15 +1483,25 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Pa return Packet(pdiv(result_scaled.v, y_max)); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y) { + // In the following we annotate the code for the case where the inputs + // are a pair length-2 SIMD vectors representing a single pair of complex + // numbers x = a + i*b, y = c + i*d. + Packet x_re = pdupreal(x); // a, a + Packet x_im = pdupimag(x); // b, b + Packet tmp_re = Packet(pmul(x_re.v, y.v)); // a*c, a*d + Packet tmp_im = Packet(pmul(x_im.v, y.v)); // b*c, b*d + tmp_im = pcplxflip(pconj(tmp_im)); // -b*d, d*c + return padd(tmp_im, tmp_re); // a*c - b*d, a*d + b*c +} + template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) { typedef typename unpacket_traits::type Scalar; typedef typename Scalar::value_type RealScalar; typedef typename unpacket_traits::as_real RealPacket; - RealPacket real_mask_rp = peven_mask(x.v); - Packet real_mask(real_mask_rp); - // Real part RealPacket x_flip = pcplxflip(x).v; // b, a Packet x_norm = phypot_complex(x); // sqrt(a^2 + b^2), sqrt(a^2 + b^2) @@ -1493,12 +1517,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Pa RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf); RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); - Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg)); // log(sqrt(a^2 + b^2)), atan2(b, a) - return xres; + return Packet(pselect(peven_mask(xreal), xreal, ximg)); // log(sqrt(a^2 + b^2)), atan2(b, a) } template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) { + // FIXME(rmlarsen): This does not work correctly for Packets of std::complex. typedef typename unpacket_traits::as_real RealPacket; typedef typename unpacket_traits::type Scalar; typedef typename Scalar::value_type RealScalar; @@ -1516,7 +1540,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Pa // cis(y): RealPacket y = pand(odd_mask, a.v); y = por(y, pcplxflip(Packet(y)).v); - RealPacket cisy = psincos_float(y); + RealPacket cisy = psincos_selector(y); cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y) const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index de30bcb25..44ccb745b 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -150,6 +150,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const P template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y); +/** \internal \returns x * y for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y); + template struct ppolevl; diff --git a/Eigen/src/Core/arch/clang/Complex.h b/Eigen/src/Core/arch/clang/Complex.h new file mode 100644 index 000000000..d6cc435a6 --- /dev/null +++ b/Eigen/src/Core/arch/clang/Complex.h @@ -0,0 +1,404 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_CLANG_H +#define EIGEN_COMPLEX_CLANG_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +struct complex_packet_wrapper { + using RealPacketT = detail::VectorType; + EIGEN_STRONG_INLINE complex_packet_wrapper() = default; + EIGEN_STRONG_INLINE explicit complex_packet_wrapper(const RealPacketT& a) : v(a) {} + EIGEN_STRONG_INLINE constexpr std::complex operator[](Index i) const { + return std::complex(v[2 * i], v[2 * i + 1]); + } + RealPacketT v; +}; + +using Packet8cf = complex_packet_wrapper; +using Packet4cf = complex_packet_wrapper; +using Packet2cf = complex_packet_wrapper; +using Packet4cd = complex_packet_wrapper; +using Packet2cd = complex_packet_wrapper; + +struct generic_complex_packet_traits : default_packet_traits { + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasArg = 0, + HasSetLinear = 0, + HasConj = 1, + // Math functions + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + }; +}; + +template <> +struct packet_traits> : generic_complex_packet_traits { + using type = Packet8cf; + using half = Packet8cf; + enum { + size = 8, + }; +}; + +template <> +struct unpacket_traits : generic_unpacket_traits { + using type = std::complex; + using half = Packet8cf; + using as_real = Packet16f; + enum { + size = 8, + }; +}; + +template <> +struct packet_traits> : generic_complex_packet_traits { + using type = Packet4cd; + using half = Packet4cd; + enum { + size = 4, + HasExp = 0, // FIXME(rmlarsen): pexp_complex is broken for double. + }; +}; + +template <> +struct unpacket_traits : generic_unpacket_traits { + using type = std::complex; + using half = Packet4cd; + using as_real = Packet8d; + enum { + size = 4, + }; +}; + +// ------------ Load and store ops ---------- +#define EIGEN_CLANG_COMPLEX_LOAD_STORE(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE ploadu(const unpacket_traits::type* from) { \ + return PACKET_TYPE(ploadu::as_real>(&numext::real_ref(*from))); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pload(const unpacket_traits::type* from) { \ + return PACKET_TYPE(pload::as_real>(&numext::real_ref(*from))); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstoreu::type, PACKET_TYPE>( \ + typename unpacket_traits::type * to, const PACKET_TYPE& from) { \ + pstoreu(&numext::real_ref(*to), from.v); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstore::type, PACKET_TYPE>( \ + typename unpacket_traits::type * to, const PACKET_TYPE& from) { \ + pstore(&numext::real_ref(*to), from.v); \ + } + +EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet8cf); +EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet4cd); +#undef EIGEN_CLANG_COMPLEX_LOAD_STORE + +template <> +EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { + const float re = numext::real(from); + const float im = numext::imag(from); + return Packet8cf(Packet16f{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im}); +} + +template <> +EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { + const double re = numext::real(from); + const double im = numext::imag(from); + return Packet4cd(Packet8d{re, im, re, im, re, im, re, im}); +} + +// ----------- Unary ops ------------------ +#define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE OP(const PACKET_TYPE& a) { \ + return PACKET_TYPE(OP(a.v)); \ + } + +#define EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PACKET_TYPE) \ + DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pnegate) \ + DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pzero) \ + template <> \ + EIGEN_STRONG_INLINE unpacket_traits::type pfirst(const PACKET_TYPE& a) { \ + return a[0]; \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pexp(const PACKET_TYPE& a) { \ + return pexp_complex(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE plog(const PACKET_TYPE& a) { \ + return plog_complex(a); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE psqrt(const PACKET_TYPE& a) { \ + return psqrt_complex(a); \ + } + +EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet8cf); +EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet4cd); + +template <> +EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) { + return Packet8cf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31)); +} +template <> +EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) { + return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); +} +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); +} + +template <> +EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) { + return Packet4cd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); +} +template <> +EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) { + return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); +} + +#undef DELEGATE_UNARY_TO_REAL_OP +#undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS + +// Flip real and imaginary parts, i.e. {re(a), im(a)} -> {im(a), re(a)}. +template <> +EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& a) { + return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); +} +template <> +EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& a) { + return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); +} +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); +} +template <> +EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& a) { + return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); +} +template <> +EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& a) { + return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); +} + +// Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}. +template <> +EIGEN_STRONG_INLINE Packet8cf pdupreal(const Packet8cf& a) { + return Packet8cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14)); +} +template <> +EIGEN_STRONG_INLINE Packet4cf pdupreal(const Packet4cf& a) { + return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); +} +template <> +EIGEN_STRONG_INLINE Packet2cf pdupreal(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); +} +template <> +EIGEN_STRONG_INLINE Packet4cd pdupreal(const Packet4cd& a) { + return Packet4cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); +} +template <> +EIGEN_STRONG_INLINE Packet2cd pdupreal(const Packet2cd& a) { + return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); +} + +// Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}. +template <> +EIGEN_STRONG_INLINE Packet8cf pdupimag(const Packet8cf& a) { + return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15)); +} +template <> +EIGEN_STRONG_INLINE Packet4cf pdupimag(const Packet4cf& a) { + return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); +} +template <> +EIGEN_STRONG_INLINE Packet2cf pdupimag(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); +} +template <> +EIGEN_STRONG_INLINE Packet4cd pdupimag(const Packet4cd& a) { + return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); +} +template <> +EIGEN_STRONG_INLINE Packet2cd pdupimag(const Packet2cd& a) { + return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); +} + +template <> +EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) { + return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), + std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]), + std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])}); +} +template <> +EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { + return Packet4cd(Packet8d{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); +} + +template <> +EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) { + return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); +} +template <> +EIGEN_STRONG_INLINE Packet4cd ploadquad(const std::complex* from) { + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { + return Packet8cf(reinterpret_cast(preverse(reinterpret_cast(a.v)))); +} +template <> +EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { + return Packet4cd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1)); +} + +// ----------- Binary ops ------------------ +#define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE OP(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return PACKET_TYPE(OP(a.v, b.v)); \ + } + +#define EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PACKET_TYPE) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, psub) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pand) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, por) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pxor) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pandnot) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pdiv(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return pdiv_complex(a, b); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pcmp_eq(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + const PACKET_TYPE t = PACKET_TYPE(pcmp_eq(a.v, b.v)); \ + return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v)); \ + } + +EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet8cf); +EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd); + +// Binary ops that are needed on sub-packets for predux and predux_mul. +#define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE) \ + DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, padd) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pmul(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ + return pmul_complex(a, b); \ + } + +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet8cf); +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf); +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf); +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cd); +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd); + +#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE void pscatter(unpacket_traits::type* to, const PACKET_TYPE& from, Index stride) { \ + constexpr int size = unpacket_traits::size; \ + for (int i = 0; i < size; ++i) { \ + to[i * stride] = from[i]; \ + } \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pgather::type, PACKET_TYPE>( \ + const unpacket_traits::type* from, Index stride) { \ + constexpr int size = unpacket_traits::size; \ + PACKET_TYPE result; \ + for (int i = 0; i < size; ++i) { \ + const unpacket_traits::type from_i = from[i * stride]; \ + result.v[2 * i] = numext::real(from_i); \ + result.v[2 * i + 1] = numext::imag(from_i); \ + } \ + return result; \ + } + +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8cf); +EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd); +#undef EIGEN_CLANG_PACKET_SCATTER_GATHER + +#undef DELEGATE_BINARY_TO_REAL_OP +#undef EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS +#undef EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS + +// ------------ ternary ops ------------- +template <> +EIGEN_STRONG_INLINE Packet8cf pselect(const Packet8cf& mask, const Packet8cf& a, const Packet8cf& b) { + return Packet8cf(reinterpret_cast( + pselect(reinterpret_cast(mask.v), reinterpret_cast(a.v), reinterpret_cast(b.v)))); +} + +namespace detail { +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8cf& p1, Packet8cf& p2) { + Packet16f tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); + p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31); + p1.v = tmp; +} + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4cd& p1, Packet4cd& p2) { + Packet8d tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11); + p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15); + p1.v = tmp; +} +} // namespace detail + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_CLANG_H diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h index 55ddc1cbd..e142264b8 100644 --- a/Eigen/src/Core/arch/clang/PacketMath.h +++ b/Eigen/src/Core/arch/clang/PacketMath.h @@ -245,28 +245,30 @@ EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t* // --- Intrinsic-like specializations --- // --- Load/Store operations --- -#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE, SCALAR_TYPE) \ - template <> \ - EIGEN_STRONG_INLINE PACKET_TYPE ploadu(const SCALAR_TYPE* from) { \ - return detail::load_vector_unaligned(from); \ - } \ - template <> \ - EIGEN_STRONG_INLINE PACKET_TYPE pload(const SCALAR_TYPE* from) { \ - return detail::load_vector_aligned(from); \ - } \ - template <> \ - EIGEN_STRONG_INLINE void pstoreu(SCALAR_TYPE * to, const PACKET_TYPE& from) { \ - detail::store_vector_unaligned(to, from); \ - } \ - template <> \ - EIGEN_STRONG_INLINE void pstore(SCALAR_TYPE * to, const PACKET_TYPE& from) { \ - detail::store_vector_aligned(to, from); \ +#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE) \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE ploadu(const detail::scalar_type_of_vector_t* from) { \ + return detail::load_vector_unaligned(from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE PACKET_TYPE pload(const detail::scalar_type_of_vector_t* from) { \ + return detail::load_vector_aligned(from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstoreu, PACKET_TYPE>( \ + detail::scalar_type_of_vector_t * to, const PACKET_TYPE& from) { \ + detail::store_vector_unaligned(to, from); \ + } \ + template <> \ + EIGEN_STRONG_INLINE void pstore, PACKET_TYPE>( \ + detail::scalar_type_of_vector_t * to, const PACKET_TYPE& from) { \ + detail::store_vector_aligned(to, from); \ } -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f, float) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d, double) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i, int32_t) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l, int64_t) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l) #undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET // --- Broadcast operation --- @@ -329,6 +331,10 @@ EIGEN_STRONG_INLINE Packet8d pcast_long_to_double(const Packet8l& a) { return re // Bitwise ops for integer packets #define EIGEN_CLANG_PACKET_BITWISE_INT(PACKET_TYPE) \ + template <> \ + constexpr EIGEN_STRONG_INLINE PACKET_TYPE pzero(const PACKET_TYPE& /*unused*/) { \ + return PACKET_TYPE(0); \ + } \ template <> \ constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue(const PACKET_TYPE& /*unused*/) { \ return PACKET_TYPE(0) == PACKET_TYPE(0); \ @@ -370,8 +376,9 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l) // Bitwise ops for floating point packets #define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT) \ template <> \ - EIGEN_STRONG_INLINE PACKET_TYPE ptrue(const PACKET_TYPE& a) { \ - return CAST_FROM_INT(CAST_TO_INT(a) == CAST_TO_INT(a)); \ + EIGEN_STRONG_INLINE PACKET_TYPE ptrue(const PACKET_TYPE& /* unused */) { \ + using Scalar = detail::scalar_type_of_vector_t; \ + return CAST_FROM_INT(PACKET_TYPE(Scalar(0)) == PACKET_TYPE(Scalar(0))); \ } \ template <> \ EIGEN_STRONG_INLINE PACKET_TYPE pand(const PACKET_TYPE& a, const PACKET_TYPE& b) { \ @@ -537,6 +544,7 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f) EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d) EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i) EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l) + #undef EIGEN_CLANG_PACKET_SCATTER_GATHER // ---- Various operations that depend on __builtin_shufflevector. @@ -653,6 +661,21 @@ EIGEN_STRONG_INLINE Packet8l plset(const int64_t& a) { return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7}; } +template <> +EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /* unused */) { + float kTrue = numext::bit_cast(int32_t(-1)); + float kFalse = 0.0f; + return Packet16f{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, + kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; +} + +template <> +EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /* unused */) { + double kTrue = numext::bit_cast(int64_t(-1l)); + double kFalse = 0.0; + return Packet8d{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; +} + // Helpers for ptranspose. namespace detail { diff --git a/Eigen/src/Core/arch/clang/Reductions.h b/Eigen/src/Core/arch/clang/Reductions.h index d9cc5585d..1a6387a37 100644 --- a/Eigen/src/Core/arch/clang/Reductions.h +++ b/Eigen/src/Core/arch/clang/Reductions.h @@ -57,54 +57,100 @@ EIGEN_CLANG_PACKET_REDUX_INT(Packet8l) #if EIGEN_HAS_BUILTIN(__builtin_shufflevector) namespace detail { template -EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceAdd16(const VectorT& a) { - auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) + - __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15); - auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) + __builtin_shufflevector(t1, t1, 1, 3, 5, 7); - auto t3 = __builtin_shufflevector(t2, t2, 0, 2) + __builtin_shufflevector(t2, t2, 1, 3); - return t3[0] + t3[1]; +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd16( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) + + __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) + __builtin_shufflevector(t1, t1, 4, 5, 6, 7); + const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) + __builtin_shufflevector(t2, t2, 2, 3); + return {t3[0], t3[1]}; } template -EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceAdd8(const VectorT& a) { - auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) + __builtin_shufflevector(a, a, 1, 3, 5, 7); - auto t2 = __builtin_shufflevector(t1, t1, 0, 2) + __builtin_shufflevector(t1, t1, 1, 3); - return t2[0] + t2[1]; +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd8( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3); + return {t2[0], t2[1]}; } template -EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceMul16(const VectorT& a) { - auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) * - __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15); - auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) * __builtin_shufflevector(t1, t1, 1, 3, 5, 7); - auto t3 = __builtin_shufflevector(t2, t2, 0, 2) * __builtin_shufflevector(t2, t2, 1, 3); - return t3[0] * t3[1]; +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul16( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) * + __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) * __builtin_shufflevector(t1, t1, 4, 5, 6, 7); + const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3); + return {t3[0], t3[1]}; } template -EIGEN_STRONG_INLINE scalar_type_of_vector_t ReduceMul8(const VectorT& a) { - auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) * __builtin_shufflevector(a, a, 1, 3, 5, 7); - auto t2 = __builtin_shufflevector(t1, t1, 0, 2) * __builtin_shufflevector(t1, t1, 1, 3); - return t2[0] * t2[1]; +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul8( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3); + return {t2[0], t2[1]}; } } // namespace detail template <> EIGEN_STRONG_INLINE float predux(const Packet16f& a) { - return detail::ReduceAdd16(a); + float even, odd; + std::tie(even, odd) = detail::ReduceAdd16(a); + return even + odd; } template <> EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - return detail::ReduceAdd8(a); + double even, odd; + std::tie(even, odd) = detail::ReduceAdd8(a); + return even + odd; } template <> EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { - return detail::ReduceMul16(a); + float even, odd; + std::tie(even, odd) = detail::ReduceMul16(a); + return even * odd; } template <> EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - return detail::ReduceMul8(a); + double even, odd; + std::tie(even, odd) = detail::ReduceMul8(a); + return even * odd; } + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) { + float re, im; + std::tie(re, im) = detail::ReduceAdd16(a.v); + return std::complex(re, im); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) { + double re, im; + std::tie(re, im) = detail::ReduceAdd8(a.v); + return std::complex(re, im); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) { + const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7)); + const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15)); + const Packet4cf prod4 = pmul(lower4, upper4); + const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 0, 1, 2, 3)); + const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 4, 5, 6, 7)); + const Packet2cf prod2 = pmul(lower2, upper2); + return prod2[0] * prod2[1]; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) { + const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3)); + const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7)); + const Packet2cd prod2 = pmul(lower2, upper2); + return prod2[0] * prod2[1]; +} + #endif } // end namespace internal