mirror of https://gitlab.com/libeigen/eigen.git synced 2026-01-18 17:31:19 +01:00

Merge remote-tracking branch 'origin'

Chip Kerchner
2025-11-12 15:08:17 +00:00
27 changed files with 609 additions and 494 deletions

View File

@@ -2,6 +2,21 @@
## [Unreleased]
New features:
- ComplexQZ implementation [!1962]
- Generic clang vector extension backend [!2051]
## [5.0.1] - 2025-11-11
A few bug-fixes from the master branch, including
- Dirty git state [#2995]
- Failing geo_homogeneous tests [#2977]
- Alignment issues [#2982, #2984]
- Missing C++20 `<version>` header [#2986]
- BLAS/LAPACK build on Windows [#2980]
See the full lists of [addressed bugs](https://gitlab.com/libeigen/eigen/-/issues?state=all&label_name%5B%5D=release%3A%3A5.0.1) and [merge requests](https://gitlab.com/libeigen/eigen/-/merge_requests?state=all&label_name%5B%5D=release%3A%3A5.0.1) for more details.
## [5.0.0] - 2025-09-30
Eigen 5.0 provides many new features, performance enhancements, and bugfixes throughout Eigen's core template expression infrastructure and linear algebra facilities. The full set of changes and related issues is too large to list here, but can be accessed via the release milestone %"5.0".

View File

@@ -367,11 +367,11 @@ class DenseBase
EIGEN_DEVICE_FUNC inline bool allFinite() const;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other);
template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
template <bool Enable = internal::complex_array_access<Scalar>::value, typename = std::enable_if_t<Enable>>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const RealScalar& other);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other);
template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
template <bool Enable = internal::complex_array_access<Scalar>::value, typename = std::enable_if_t<Enable>>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const RealScalar& other);
typedef internal::add_const_on_value_type_t<typename internal::eval<Derived>::type> EvalReturnType;
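As a point of reference, here is a minimal, self-contained usage sketch of the real-scalar compound operators declared above on a complex matrix (the matrix and values are illustrative; this assumes a standard Eigen build):
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main() {
  // 2x2 complex matrix; Scalar = std::complex<double>, RealScalar = double.
  Eigen::Matrix2cd m;
  m << std::complex<double>(1, 2), std::complex<double>(3, -1),
       std::complex<double>(0, 4), std::complex<double>(-2, 5);

  m *= 2.0;  // RealScalar operator*= : scales both real and imaginary parts
  m /= 4.0;  // RealScalar operator/=

  std::cout << m << "\n";
  return 0;
}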

View File

@@ -20,10 +20,7 @@ namespace internal {
template <typename Derived, typename Scalar = typename traits<Derived>::Scalar>
struct squared_norm_impl {
using Real = typename NumTraits<Scalar>::Real;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Real run(const Derived& a) {
Scalar result = a.unaryExpr(squared_norm_functor<Scalar>()).sum();
return numext::real(result) + numext::imag(result);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Real run(const Derived& a) { return a.realView().cwiseAbs2().sum(); }
};
template <typename Derived>
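The rewritten squared_norm_impl above relies on the identity |z|^2 = Re(z)^2 + Im(z)^2 for a complex scalar z, so summing the squared entries of the interleaved real view gives the same result as the previous real+imag accumulation. A small self-contained cross-check using only public Eigen API (the vector and its values are illustrative):
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main() {
  Eigen::VectorXcd v(3);
  v << std::complex<double>(1, 2), std::complex<double>(-3, 0.5), std::complex<double>(0, -4);

  // squaredNorm() == sum over i of Re(v_i)^2 + Im(v_i)^2,
  // i.e. the sum of the squared scalars of the interleaved real view.
  double manual = 0.0;
  for (Eigen::Index i = 0; i < v.size(); ++i)
    manual += v[i].real() * v[i].real() + v[i].imag() * v[i].imag();

  std::cout << v.squaredNorm() << " == " << manual << "\n";
  return 0;
}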

View File

@@ -57,12 +57,12 @@ struct default_packet_traits {
HasConj = 1,
HasSetLinear = 1,
HasSign = 1,
HasAbsDiff = 1,
// By default, the nearest integer functions (rint, round, floor, ceil, trunc) are enabled for all scalar and packet
// types
HasRound = 1,
HasArg = 0,
HasAbsDiff = 0,
// This flag is used to indicate whether packet comparison is supported.
// pcmp_eq and pcmp_lt should be defined for it to be true.
HasCmp = 0,
@@ -116,6 +116,7 @@ struct packet_traits : default_packet_traits {
enum {
HasAdd = 0,
HasSub = 0,
HasAbsDiff = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
@@ -130,17 +131,18 @@ struct packet_traits : default_packet_traits {
template <typename T>
struct packet_traits<const T> : packet_traits<T> {};
struct default_unpacket_traits {
enum { vectorizable = false, masked_load_available = false, masked_store_available = false };
};
template <typename T>
struct unpacket_traits {
struct unpacket_traits : default_unpacket_traits {
typedef T type;
typedef T half;
typedef typename numext::get_integer_by_size<sizeof(T)>::signed_type integer_packet;
enum {
size = 1,
alignment = alignof(T),
vectorizable = false,
masked_load_available = false,
masked_store_available = false
};
};
@@ -747,9 +749,15 @@ EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet& a, const Packet& exponent)
/** \internal \returns the absolute difference of \a a and \a b (coeff-wise) */
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) {
EIGEN_DEVICE_FUNC inline std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsInteger, Packet>
pabsdiff(const Packet& a, const Packet& b) {
return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b));
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger, Packet>
pabsdiff(const Packet& a, const Packet& b) {
return pabs(psub(a, b));
}
/** \internal \returns a packet version of \a *from, from must be properly aligned */
template <typename Packet>
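The pabsdiff split above keeps the integer path free of a potentially wrapping subtraction: for integer (notably unsigned) packets the absolute difference is a compare-and-select, while floating-point packets simply take |a - b|. A scalar sketch of the same idea (standalone code; the function names are invented for illustration):
#include <cstdint>
#include <cassert>

// Select-based absolute difference: safe for unsigned types, where
// computing b - a when a > b would wrap around.
static uint32_t absdiff_u32(uint32_t a, uint32_t b) { return a < b ? b - a : a - b; }

// Floating-point path: |a - b| directly.
static float absdiff_f32(float a, float b) { float d = a - b; return d < 0.0f ? -d : d; }

int main() {
  assert(absdiff_u32(3u, 10u) == 7u);   // naive 3u - 10u would wrap to a huge value
  assert(absdiff_u32(10u, 3u) == 7u);
  assert(absdiff_f32(1.5f, -2.0f) == 3.5f);
  return 0;
}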

View File

@@ -37,15 +37,16 @@ struct generic_reciprocal_newton_step {
static_assert(Steps > 0, "Steps must be at least 1.");
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_a_recip) {
using Scalar = typename unpacket_traits<Packet>::type;
const Packet two = pset1<Packet>(Scalar(2));
const Packet one = pset1<Packet>(Scalar(1));
// Refine the approximation using one Newton-Raphson step:
// x_{i} = x_{i-1} * (2 - a * x_{i-1})
const Packet x = generic_reciprocal_newton_step<Packet, Steps - 1>::run(a, approx_a_recip);
const Packet tmp = pnmadd(a, x, two);
const Packet tmp = pnmadd(a, x, one);
// If tmp is NaN, it means that a is either +/-0 or +/-Inf.
// In this case return the approximation directly.
const Packet is_not_nan = pcmp_eq(tmp, tmp);
return pselect(is_not_nan, pmul(x, tmp), x);
// Use two FMAs instead of FMA+FMUL to improve precision.
return pselect(is_not_nan, pmadd(x, tmp, x), x);
}
};
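For reference, a scalar sketch of the refinement step above: given an approximation x ≈ 1/a, one Newton-Raphson iteration computes x * (2 - a*x), and the revised code evaluates the equivalent form x + x*(1 - a*x) so that both operations are fused multiply-adds. This is illustrative standalone code, not the Eigen implementation:
#include <cmath>
#include <cstdio>

// One Newton-Raphson refinement of an approximate reciprocal.
// x_{i} = x_{i-1} * (2 - a * x_{i-1})  ==  x_{i-1} + x_{i-1} * (1 - a * x_{i-1})
static float refine_recip(float a, float x) {
  float tmp = std::fma(-a, x, 1.0f);   // 1 - a*x   (pnmadd)
  if (std::isnan(tmp)) return x;       // a is +/-0 or +/-Inf: keep the raw approximation
  return std::fma(x, tmp, x);          // x + x*tmp (pmadd)
}

int main() {
  float a = 3.0f;
  float x = 0.3f;                      // rough guess for 1/3
  for (int i = 0; i < 3; ++i) x = refine_recip(a, x);
  std::printf("%.9f (exact %.9f)\n", x, 1.0f / a);
  return 0;
}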

View File

@@ -17,20 +17,16 @@ namespace Eigen {
namespace internal {
// Vectorized assignment to RealView requires array-oriented access to the real and imaginary components.
// Write access and vectorization require array-oriented access to the real and imaginary components.
// From https://en.cppreference.com/w/cpp/numeric/complex.html:
// For any pointer to an element of an array of std::complex<T> named p and any valid array index i,
// reinterpret_cast<T*>(p)[2 * i] is the real part of the complex number p[i], and
// reinterpret_cast<T*>(p)[2 * i + 1] is the imaginary part of the complex number p[i].
template <typename ComplexScalar>
template <typename T>
struct complex_array_access : std::false_type {};
template <>
struct complex_array_access<std::complex<float>> : std::true_type {};
template <>
struct complex_array_access<std::complex<double>> : std::true_type {};
template <>
struct complex_array_access<std::complex<long double>> : std::true_type {};
template <typename T>
struct complex_array_access<std::complex<T>> : std::true_type {};
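A minimal illustration of the array-access guarantee quoted above; this is the standard layout of std::complex<T>, not anything Eigen-specific:
#include <complex>
#include <cassert>

int main() {
  std::complex<double> z[2] = {{1.0, 2.0}, {3.0, 4.0}};
  const double* r = reinterpret_cast<const double*>(z);

  // reinterpret_cast<T*>(p)[2*i]     -> real part of p[i]
  // reinterpret_cast<T*>(p)[2*i + 1] -> imaginary part of p[i]
  assert(r[0] == 1.0 && r[1] == 2.0);
  assert(r[2] == 3.0 && r[3] == 4.0);
  return 0;
}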
template <typename Xpr>
struct traits<RealView<Xpr>> : public traits<Xpr> {
@@ -40,13 +36,17 @@ struct traits<RealView<Xpr>> : public traits<Xpr> {
if (size_as_int == Dynamic) return Dynamic;
return times_two ? (2 * size_as_int) : size_as_int;
}
using Base = traits<Xpr>;
using ComplexScalar = typename Base::Scalar;
using Scalar = typename NumTraits<ComplexScalar>::Real;
static constexpr int ActualDirectAccessBit = complex_array_access<ComplexScalar>::value ? DirectAccessBit : 0;
static constexpr bool ArrayAccess = complex_array_access<ComplexScalar>::value;
static constexpr int ActualDirectAccessBit = ArrayAccess ? DirectAccessBit : 0;
static constexpr int ActualLvaluebit = !std::is_const<Xpr>::value && ArrayAccess ? LvalueBit : 0;
static constexpr int ActualPacketAccessBit = packet_traits<Scalar>::Vectorizable ? PacketAccessBit : 0;
static constexpr int FlagMask =
ActualDirectAccessBit | ActualPacketAccessBit | HereditaryBits | LinearAccessBit | LvalueBit;
ActualDirectAccessBit | ActualLvaluebit | ActualPacketAccessBit | HereditaryBits | LinearAccessBit;
static constexpr int BaseFlags = int(evaluator<Xpr>::Flags) | int(Base::Flags);
static constexpr int Flags = BaseFlags & FlagMask;
static constexpr bool IsRowMajor = Flags & RowMajorBit;
@@ -66,68 +66,84 @@ struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
using XprType = RealView<Xpr>;
using ExpressionTraits = traits<XprType>;
using ComplexScalar = typename ExpressionTraits::ComplexScalar;
using ComplexCoeffReturnType = typename BaseEvaluator::CoeffReturnType;
using Scalar = typename ExpressionTraits::Scalar;
static constexpr bool IsRowMajor = ExpressionTraits::IsRowMajor;
static constexpr int Flags = ExpressionTraits::Flags;
static constexpr int CoeffReadCost = BaseEvaluator::CoeffReadCost;
static constexpr int Alignment = BaseEvaluator::Alignment;
static constexpr bool IsRowMajor = ExpressionTraits::IsRowMajor;
static constexpr bool DirectAccess = Flags & DirectAccessBit;
using ComplexCoeffReturnType = std::conditional_t<DirectAccess, const ComplexScalar&, ComplexScalar>;
using CoeffReturnType = std::conditional_t<DirectAccess, const Scalar&, Scalar>;
EIGEN_DEVICE_FUNC explicit evaluator(XprType realView) : BaseEvaluator(realView.m_xpr) {}
template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
template <bool Enable = DirectAccess, std::enable_if_t<!Enable, bool> = true>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const {
ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
Index p = (IsRowMajor ? col : row) & 1;
return p ? numext::real(cscalar) : numext::imag(cscalar);
Index r = IsRowMajor ? row : row / 2;
Index c = IsRowMajor ? col / 2 : col;
bool p = (IsRowMajor ? col : row) & 1;
ComplexScalar ccoeff = BaseEvaluator::coeff(r, c);
return p ? numext::imag(ccoeff) : numext::real(ccoeff);
}
template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index row, Index col) const {
ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
template <bool Enable = DirectAccess, std::enable_if_t<Enable, bool> = true>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
Index r = IsRowMajor ? row : row / 2;
Index c = IsRowMajor ? col / 2 : col;
Index p = (IsRowMajor ? col : row) & 1;
return reinterpret_cast<const Scalar(&)[2]>(cscalar)[p];
ComplexCoeffReturnType ccoeff = BaseEvaluator::coeff(r, c);
return reinterpret_cast<const Scalar(&)[2]>(ccoeff)[p];
}
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
ComplexScalar& cscalar = BaseEvaluator::coeffRef(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
Index p = (IsRowMajor ? col : row) & 1;
return reinterpret_cast<Scalar(&)[2]>(cscalar)[p];
}
template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
template <bool Enable = DirectAccess, std::enable_if_t<!Enable, bool> = true>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {
ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
Index p = index & 1;
return p ? numext::real(cscalar) : numext::imag(cscalar);
ComplexScalar ccoeff = BaseEvaluator::coeff(index / 2);
bool p = index & 1;
return p ? numext::imag(ccoeff) : numext::real(ccoeff);
}
template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
template <bool Enable = DirectAccess, std::enable_if_t<Enable, bool> = true>
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
ComplexCoeffReturnType ccoeff = BaseEvaluator::coeff(index / 2);
Index p = index & 1;
return reinterpret_cast<const Scalar(&)[2]>(cscalar)[p];
return reinterpret_cast<const Scalar(&)[2]>(ccoeff)[p];
}
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
Index r = IsRowMajor ? row : row / 2;
Index c = IsRowMajor ? col / 2 : col;
Index p = (IsRowMajor ? col : row) & 1;
ComplexScalar& ccoeffRef = BaseEvaluator::coeffRef(r, c);
return reinterpret_cast<Scalar(&)[2]>(ccoeffRef)[p];
}
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
ComplexScalar& cscalar = BaseEvaluator::coeffRef(index / 2);
ComplexScalar& ccoeffRef = BaseEvaluator::coeffRef(index / 2);
Index p = index & 1;
return reinterpret_cast<Scalar(&)[2]>(cscalar)[p];
return reinterpret_cast<Scalar(&)[2]>(ccoeffRef)[p];
}
// If the first index is odd (imaginary), discard the first scalar
// in 'result' and assign the missing scalar.
// This operation is safe as the real component of the first scalar must exist.
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
MISSING COMPATIBLE COMPLEX PACKET TYPE)
eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
Index crow = IsRowMajor ? row : row / 2;
Index ccol = IsRowMajor ? col / 2 : col;
ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(crow, ccol);
return preinterpret<PacketType, ComplexPacket>(cpacket);
Index r = IsRowMajor ? row : row / 2;
Index c = IsRowMajor ? col / 2 : col;
bool p = (IsRowMajor ? col : row) & 1;
ComplexPacket cresult = BaseEvaluator::template packet<LoadMode, ComplexPacket>(r, c);
PacketType result = preinterpret<PacketType>(cresult);
if (p) {
Scalar aux[RealPacketSize + 1];
pstoreu(aux, result);
Index lastr = IsRowMajor ? row : row + RealPacketSize - 1;
Index lastc = IsRowMajor ? col + RealPacketSize - 1 : col;
aux[RealPacketSize] = coeff(lastr, lastc);
result = ploadu<PacketType>(aux + 1);
}
return result;
}
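The odd-first-index fix-up used in packet() above can be modelled with plain arrays: the complex-aligned load starts one scalar early, so the loaded values are shifted left by one and the single trailing scalar is fetched separately. A standalone scalar sketch (function and constant names are invented for illustration):
#include <cassert>
#include <cstring>

// Scalar model of the odd-offset fix-up: load kPacketSize values starting at an
// odd real-view index 'begin' from interleaved (re, im) storage. The aligned
// load starts one slot early (at the enclosing complex element), so drop its
// first value and append the one scalar that falls past the load.
constexpr int kPacketSize = 4;

static void load_from_odd(const double* interleaved, int begin, double* out) {
  assert(begin % 2 == 1);
  double aux[kPacketSize + 1];
  std::memcpy(aux, interleaved + (begin - 1), kPacketSize * sizeof(double));  // aligned load
  aux[kPacketSize] = interleaved[begin + kPacketSize - 1];                    // missing tail scalar
  std::memcpy(out, aux + 1, kPacketSize * sizeof(double));                    // shift by one
}

int main() {
  // Three complex numbers stored as re0, im0, re1, im1, re2, im2.
  const double data[6] = {1, 2, 3, 4, 5, 6};
  double out[kPacketSize];
  load_from_odd(data, 1, out);  // request real-view indices 1..4 -> {2, 3, 4, 5}
  assert(out[0] == 2 && out[1] == 3 && out[2] == 4 && out[3] == 5);
  return 0;
}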
template <int LoadMode, typename PacketType>
@@ -136,28 +152,48 @@ struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
MISSING COMPATIBLE COMPLEX PACKET TYPE)
eigen_assert((index % 2 == 0) && "the index must be even");
Index cindex = index / 2;
ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(cindex);
return preinterpret<PacketType, ComplexPacket>(cpacket);
ComplexPacket cresult = BaseEvaluator::template packet<LoadMode, ComplexPacket>(index / 2);
PacketType result = preinterpret<PacketType>(cresult);
bool p = index & 1;
if (p) {
Scalar aux[RealPacketSize + 1];
pstoreu(aux, result);
aux[RealPacketSize] = coeff(index + RealPacketSize - 1);
result = ploadu<PacketType>(aux + 1);
}
return result;
}
// The requested real packet segment forms the half-open interval [begin, end), where 'end' = 'begin' + 'count'.
// In order to access the underlying complex array, even indices must be aligned with the real components
// of the complex scalars. 'begin' and 'count' must be modified as follows:
// a) 'begin' must be rounded down to the nearest even number; and
// b) 'end' must be rounded up to the nearest even number.
template <int LoadMode, typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
MISSING COMPATIBLE COMPLEX PACKET TYPE)
eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
Index crow = IsRowMajor ? row : row / 2;
Index ccol = IsRowMajor ? col / 2 : col;
Index cbegin = begin / 2;
Index ccount = count / 2;
ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(crow, ccol, cbegin, ccount);
return preinterpret<PacketType, ComplexPacket>(cpacket);
Index actualBegin = numext::round_down(begin, 2);
Index actualEnd = numext::round_down(begin + count + 1, 2);
Index actualCount = actualEnd - actualBegin;
Index r = IsRowMajor ? row : row / 2;
Index c = IsRowMajor ? col / 2 : col;
ComplexPacket cresult =
BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(r, c, actualBegin / 2, actualCount / 2);
PacketType result = preinterpret<PacketType>(cresult);
bool p = (IsRowMajor ? col : row) & 1;
if (p) {
Scalar aux[RealPacketSize + 1] = {};
pstoreu(aux, result);
Index lastr = IsRowMajor ? row : row + actualEnd - 1;
Index lastc = IsRowMajor ? col + actualEnd - 1 : col;
aux[actualEnd] = coeff(lastr, lastc);
result = ploadu<PacketType>(aux + 1);
}
return result;
}
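A worked instance of the even-alignment rounding described above (standalone arithmetic; round_down2 mirrors numext::round_down(x, 2) for non-negative x):
#include <cassert>

static long round_down2(long x) { return x - (x % 2); }

int main() {
  long begin = 3, count = 4;                       // requested real-view segment [3, 7)
  long actualBegin = round_down2(begin);           // 2
  long actualEnd = round_down2(begin + count + 1); // 8  (end rounded up to even)
  long actualCount = actualEnd - actualBegin;      // 6
  assert(actualBegin == 2 && actualEnd == 8 && actualCount == 6);
  // Complex-side segment: [actualBegin/2, actualBegin/2 + actualCount/2) == [1, 4)
  assert(actualBegin / 2 == 1 && actualCount / 2 == 3);
  return 0;
}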
template <int LoadMode, typename PacketType>
@@ -166,14 +202,20 @@ struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
MISSING COMPATIBLE COMPLEX PACKET TYPE)
eigen_assert((index % 2 == 0) && "the index must be even");
eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
Index cindex = index / 2;
Index cbegin = begin / 2;
Index ccount = count / 2;
ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(cindex, cbegin, ccount);
return preinterpret<PacketType, ComplexPacket>(cpacket);
Index actualBegin = numext::round_down(begin, 2);
Index actualEnd = numext::round_down(begin + count + 1, 2);
Index actualCount = actualEnd - actualBegin;
ComplexPacket cresult =
BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(index / 2, actualBegin / 2, actualCount / 2);
PacketType result = preinterpret<PacketType>(cresult);
bool p = index & 1;
if (p) {
Scalar aux[RealPacketSize + 1] = {};
pstoreu(aux, result);
aux[actualEnd] = coeff(index + actualEnd - 1);
result = ploadu<PacketType>(aux + 1);
}
return result;
}
};
@@ -211,7 +253,7 @@ class RealView : public internal::dense_xpr_base<RealView<Xpr>>::type {
EIGEN_DEVICE_FUNC RealView& operator=(const DenseBase<OtherDerived>& other);
protected:
friend struct internal::evaluator<RealView<Xpr>>;
friend struct internal::evaluator<RealView>;
Xpr& m_xpr;
};

View File

@@ -250,7 +250,6 @@ struct packet_traits<uint32_t> : default_packet_traits {
HasDiv = 0,
HasNegate = 0,
HasSqrt = 0,
HasCmp = 1,
HasMin = 1,
@@ -277,12 +276,9 @@ struct packet_traits<uint64_t> : default_packet_traits {
AlignedOnScalar = 1,
size = 4,
// HasMin = 0,
// HasMax = 0,
HasDiv = 0,
HasTranspose = 0,
HasNegate = 0,
HasSqrt = 0,
HasCmp = 1,
HasShift = 1
};

View File

@@ -81,10 +81,10 @@ class gemm_class {
Index m;
const Index n, k, ldc;
const Index inc;
const Scalar *alpha;
const Scalar* alpha;
const Scalar *a, *b;
Scalar *c;
Scalar* c;
const bool is_alpha1;
const bool is_beta0;
@@ -92,26 +92,26 @@ class gemm_class {
const Index a_stride, b_stride;
const Index a_off, b_off;
EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
_mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar* a_addr) {
_mm_prefetch((char*)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
}
EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) {
_mm_prefetch((char *)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar* b_addr) {
_mm_prefetch((char*)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
}
EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) { _mm_prefetch((char *)(x_addr - a_shift), _MM_HINT_T2); }
EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar* x_addr) { _mm_prefetch((char*)(x_addr - a_shift), _MM_HINT_T2); }
EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) {
EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar* c_addr) {
#if defined(__PRFCHW__) && __PRFCHW__ == 1
_m_prefetchw((void *)c_addr);
_m_prefetchw((void*)c_addr);
#else
_mm_prefetch((char *)c_addr, _MM_HINT_T0);
_mm_prefetch((char*)c_addr, _MM_HINT_T0);
#endif
}
template <int nelems>
EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) {
EIGEN_ALWAYS_INLINE void a_load(vec& a_reg, const Scalar* a_addr) {
switch (nelems * sizeof(*a_addr) * 8) {
default:
case 512 * 3:
@@ -124,13 +124,13 @@ class gemm_class {
a_reg = ploadu<vec>(a_addr);
break;
case 256 * 1:
a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double *>(a_addr))));
a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double*>(a_addr))));
break;
case 128 * 1:
a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float *>(a_addr))));
a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float*>(a_addr))));
break;
case 64 * 1:
a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double *>(a_addr)));
a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double*>(a_addr)));
break;
case 32 * 1:
a_reg = pload1<vec>(a_addr);
@@ -138,10 +138,10 @@ class gemm_class {
}
}
EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) { b_reg = pload1<vec>(b_addr); }
EIGEN_ALWAYS_INLINE void b_load(vec& b_reg, const Scalar* b_addr) { b_reg = pload1<vec>(b_addr); }
template <int nelems>
EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) {
EIGEN_ALWAYS_INLINE void c_store(Scalar* mem, vec& src) {
if (is_unit_inc) {
switch (nelems * sizeof(*mem) * 8) {
default:
@@ -196,7 +196,7 @@ class gemm_class {
}
template <int nelems>
EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec &reg) {
EIGEN_ALWAYS_INLINE void vaddm(vec& dst, const Scalar* mem, vec& src, vec& reg) {
if (is_unit_inc) {
switch (nelems * sizeof(*mem) * 8) {
default:
@@ -263,7 +263,7 @@ class gemm_class {
}
}
EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) {
EIGEN_STRONG_INLINE void vfmadd(vec& dst, const vec& src1, const vec& src2) {
dst = pmadd(src1, src2, dst);
#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
@@ -273,7 +273,7 @@ class gemm_class {
}
template <int nelems>
EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec &reg) {
EIGEN_ALWAYS_INLINE void vfmaddm(vec& dst, const Scalar* mem, vec& src, vec& scale, vec& reg) {
if (is_unit_inc) {
switch (nelems * sizeof(*mem) * 8) {
default:
@@ -350,16 +350,16 @@ class gemm_class {
}
template <int j, int endX, int i, int endY, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar* ao) {
EIGEN_UNUSED_VARIABLE(ao);
}
template <int j, int endX, int i, int endY, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar* ao) {
if (j < endX) {
if (i < endY) {
auto &a_reg = zmm[a_regs[i + (j % 2) * 3]];
const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
auto& a_reg = zmm[a_regs[i + (j % 2) * 3]];
const Scalar* a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
a_load<nelems>(a_reg, a_addr);
a_loads<j, endX, i + 1, endY, nelems>(ao);
@@ -370,8 +370,8 @@ class gemm_class {
}
template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1,
const Scalar *co2) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar* co1,
const Scalar* co2) {
EIGEN_UNUSED_VARIABLE(co1);
EIGEN_UNUSED_VARIABLE(co2);
}
@@ -391,13 +391,13 @@ class gemm_class {
*/
template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar*& co1, Scalar*& co2) {
if (un < max_b_unroll) {
if (b_unroll >= un + 1) {
if (un == 4 && i == 0) co2 = co1 + 4 * ldc;
if (i < um_vecs) {
Scalar *co = (un + 1 <= 4) ? co1 : co2;
Scalar* co = (un + 1 <= 4) ? co1 : co2;
auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
prefetch_c(co + co_off);
@@ -414,16 +414,16 @@ class gemm_class {
// load_c
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar* cox, vec& alpha_reg) {
EIGEN_UNUSED_VARIABLE(cox);
EIGEN_UNUSED_VARIABLE(alpha_reg);
}
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar* cox, vec& alpha_reg) {
if (i < um_vecs) {
auto &c_reg = zmm[c_regs[i + idx * 3]];
auto &c_load_reg = zmm[c_load_regs[i % 3]];
auto& c_reg = zmm[c_regs[i + idx * 3]];
auto& c_load_reg = zmm[c_load_regs[i % 3]];
auto c_mem = cox;
if (is_unit_inc)
c_mem += i * nelems_in_cache_line;
@@ -443,14 +443,14 @@ class gemm_class {
// store_c
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar* cox) {
EIGEN_UNUSED_VARIABLE(cox);
}
template <int i, int um_vecs, int idx, int nelems>
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar* cox) {
if (i < um_vecs) {
auto &c_reg = zmm[c_regs[i + idx * 3]];
auto& c_reg = zmm[c_regs[i + idx * 3]];
auto c_mem = cox;
if (is_unit_inc)
c_mem += i * nelems_in_cache_line;
@@ -495,20 +495,20 @@ class gemm_class {
*/
template <int pow, int a_unroll, int idx>
EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
EIGEN_ALWAYS_INLINE void c_update_1count(Scalar*& cox) {
if (pow >= 4) cox += ldc;
const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
auto &alpha_reg = zmm[alpha_load_reg];
auto& alpha_reg = zmm[alpha_load_reg];
scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
write_c<0, um_vecs, idx, a_unroll>(cox);
}
template <int pow, int a_unroll>
EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) {
EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar*& co1, Scalar*& co2) {
constexpr int idx = pow / 2;
Scalar *&cox = idx == 0 ? co1 : co2;
Scalar*& cox = idx == 0 ? co1 : co2;
constexpr int max_count = (pow + 1) / 2;
static_assert(max_count <= 4, "Unsupported max_count.");
@@ -520,8 +520,8 @@ class gemm_class {
}
template <int max_b_unroll, int a_unroll, int b_unroll>
EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) {
auto &alpha_reg = zmm[alpha_load_reg];
EIGEN_ALWAYS_INLINE void c_update(Scalar*& co1, Scalar*& co2) {
auto& alpha_reg = zmm[alpha_load_reg];
co2 = co1 + ldc;
if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
@@ -542,8 +542,8 @@ class gemm_class {
// compute
template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
int &fetchB_idx, vec &b_reg) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar* ao, const Scalar* bo, int& fetchA_idx,
int& fetchB_idx, vec& b_reg) {
EIGEN_UNUSED_VARIABLE(ao);
EIGEN_UNUSED_VARIABLE(bo);
EIGEN_UNUSED_VARIABLE(fetchA_idx);
@@ -552,11 +552,11 @@ class gemm_class {
}
template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
int &fetchB_idx, vec &b_reg) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar* ao, const Scalar* bo, int& fetchA_idx,
int& fetchB_idx, vec& b_reg) {
if (um < um_vecs) {
auto &c_reg = zmm[c_regs[um + idx * 3]];
auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
auto& c_reg = zmm[c_regs[um + idx * 3]];
auto& a_reg = zmm[a_regs[um + (uk % 2) * 3]];
vfmadd(c_reg, a_reg, b_reg);
@@ -578,25 +578,25 @@ class gemm_class {
// load_a
template <int um, int um_vecs, int uk, int nelems, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar* ao) {
EIGEN_UNUSED_VARIABLE(ao);
}
template <int um, int um_vecs, int uk, int nelems, bool ktail>
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar* ao) {
if (um < um_vecs) {
auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
auto& a_reg = zmm[a_regs[um + (uk % 2) * 3]];
const Scalar* a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
a_load<nelems>(a_reg, a_addr);
load_a<um + 1, um_vecs, uk, nelems, ktail>(ao);
}
}
template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
const Scalar *const &ao,
const Scalar *const &bo, Scalar *&co2,
int &fetchA_idx, int &fetchB_idx) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar*& aa,
const Scalar* const& ao,
const Scalar* const& bo, Scalar*& co2,
int& fetchA_idx, int& fetchB_idx) {
EIGEN_UNUSED_VARIABLE(aa);
EIGEN_UNUSED_VARIABLE(ao);
EIGEN_UNUSED_VARIABLE(bo);
@@ -606,14 +606,14 @@ class gemm_class {
}
template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
const Scalar *const &ao,
const Scalar *const &bo, Scalar *&co2,
int &fetchA_idx, int &fetchB_idx) {
EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar*& aa,
const Scalar* const& ao,
const Scalar* const& bo, Scalar*& co2,
int& fetchA_idx, int& fetchB_idx) {
const int idx = (pow / 2) + count;
if (count < (pow + 1) / 2) {
auto &b_reg = zmm[b_regs[idx % 2]];
auto& b_reg = zmm[b_regs[idx % 2]];
if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
if (fetch_x && uk == 3 && idx == 4) aa += 8;
@@ -621,7 +621,7 @@ class gemm_class {
if (b_unroll >= pow) {
compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
const Scalar* b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
b_load(b_reg, b_addr);
}
@@ -643,8 +643,8 @@ class gemm_class {
template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar*& aa, const Scalar* const& ao, const Scalar* const& bo,
Scalar*& co2, int& fetchA_idx, int& fetchB_idx) {
const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
if (max_b_unroll >= 1)
@@ -701,7 +701,7 @@ class gemm_class {
template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
EIGEN_ALWAYS_INLINE void innerkernel(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co2) {
int fetchA_idx = 0;
int fetchB_idx = 0;
@@ -731,7 +731,7 @@ class gemm_class {
}
template <int a_unroll, int b_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
EIGEN_ALWAYS_INLINE void kloop(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
if (!use_less_a_regs && k > 1)
a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
@@ -795,7 +795,7 @@ class gemm_class {
}
template <int a_unroll, int b_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
EIGEN_ALWAYS_INLINE void nloop(const Scalar*& aa, const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
// Set A matrix pointer.
ao = a + a_off * a_unroll;
@@ -812,9 +812,9 @@ class gemm_class {
}
template <int a_unroll, int max_a_unroll, int max_b_unroll>
EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
EIGEN_ALWAYS_INLINE void mloop(const Scalar*& ao, const Scalar*& bo, Scalar*& co1, Scalar*& co2) {
// Set prefetch A pointers.
const Scalar *aa = a + a_unroll * a_stride;
const Scalar* aa = a + a_unroll * a_stride;
// Set C matrix pointers.
co1 = c;
@@ -856,10 +856,10 @@ class gemm_class {
a -= -a_shift;
b -= -b_shift;
const Scalar *ao = nullptr;
const Scalar *bo = nullptr;
Scalar *co1 = nullptr;
Scalar *co2 = nullptr;
const Scalar* ao = nullptr;
const Scalar* bo = nullptr;
Scalar* co1 = nullptr;
Scalar* co2 = nullptr;
// Main m-loop.
for (; m >= max_a_unroll; m -= max_a_unroll) mloop<max_a_unroll, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
@@ -883,8 +883,8 @@ class gemm_class {
}
}
gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_,
const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar* alpha_, const Scalar* a_,
const Scalar* b_, Scalar* c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
Index a_off_, Index b_off_)
: m(m_),
n(n_),
@@ -937,8 +937,8 @@ class gemm_class {
// max_a_unroll: 24, 16, 8, 4, 2, 1
// max_b_unroll: 8, 4, 2, 1
template <typename Scalar, int max_a_unroll, int max_b_unroll, bool is_alpha1, bool is_beta0, bool is_unit_inc>
EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b,
Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar* alpha, const Scalar* a, const Scalar* b,
Scalar* c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
Index a_off = 0, Index b_off = 0) {
if (a_stride == -1) a_stride = k;
if (b_stride == -1) b_stride = k;
@@ -972,13 +972,13 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMod
typedef typename packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
Index offset = 0);
};
template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode>::operator()(
Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) {
Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
constexpr int nr = 8;
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1106,7 +1106,7 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMod
HalfPacketSize = unpacket_traits<HalfPacket>::size,
QuarterPacketSize = unpacket_traits<QuarterPacket>::size
};
EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
Index offset = 0) {
constexpr int nr = 8;
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
@@ -1205,33 +1205,32 @@ struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMod
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
EIGEN_ALWAYS_INLINE void operator()(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows,
Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
Index offsetA = 0, Index offsetB = 0);
};
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_ALWAYS_INLINE void gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs>::operator()(
const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols,
const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
if (res.incr() == 1) {
if (alpha == 1) {
gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB,
(Scalar *)res.data(), res.stride(), res.incr(), strideA,
strideB, offsetA, offsetB);
gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB, (Scalar*)res.data(),
res.stride(), res.incr(), strideA, strideB, offsetA, offsetB);
} else {
gemm_kern_avx512<Scalar, mr, 8, false, false, true>(rows, cols, depth, &alpha, blockA, blockB,
(Scalar *)res.data(), res.stride(), res.incr(), strideA,
(Scalar*)res.data(), res.stride(), res.incr(), strideA,
strideB, offsetA, offsetB);
}
} else {
if (alpha == 1) {
gemm_kern_avx512<Scalar, mr, 8, true, false, false>(rows, cols, depth, &alpha, blockA, blockB,
(Scalar *)res.data(), res.stride(), res.incr(), strideA,
(Scalar*)res.data(), res.stride(), res.incr(), strideA,
strideB, offsetA, offsetB);
} else {
gemm_kern_avx512<Scalar, mr, 8, false, false, false>(rows, cols, depth, &alpha, blockA, blockB,
(Scalar *)res.data(), res.stride(), res.incr(), strideA,
(Scalar*)res.data(), res.stride(), res.incr(), strideA,
strideB, offsetA, offsetB);
}
}

View File

@@ -100,7 +100,6 @@ struct packet_traits<half> : default_packet_traits {
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasBlend = 0
};
};
#endif
@@ -118,7 +117,6 @@ struct packet_traits<float> : default_packet_traits {
HasMin = 1,
HasMax = 1,
HasConj = 1,
HasBlend = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasACos = 1,
@@ -151,7 +149,6 @@ struct packet_traits<double> : default_packet_traits {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
HasBlend = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,
@@ -176,7 +173,7 @@ template <>
struct packet_traits<int> : default_packet_traits {
typedef Packet16i type;
typedef Packet8i half;
enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 16 };
};
template <>
@@ -2512,7 +2509,6 @@ struct packet_traits<bfloat16> : default_packet_traits {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
HasBlend = 0,
HasInsert = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,

View File

@@ -59,7 +59,6 @@ struct packet_traits<half> : default_packet_traits {
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = 0, // EIGEN_FAST_MATH,
HasBlend = 0
};
};

View File

@@ -44,7 +44,7 @@
namespace Eigen {
namespace internal {
#if (EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0)
#if (EIGEN_USE_AVX512_TRSM_KERNELS)
#define EIGEN_AVX_MAX_NUM_ACC (int64_t(24))
#define EIGEN_AVX_MAX_NUM_ROW (int64_t(8)) // Denoted L in code.
@@ -60,6 +60,8 @@ typedef Packet4d vecHalfDouble;
// Note: this depends on macros and typedefs above.
#include "TrsmUnrolls.inc"
#if (EIGEN_COMP_CLANG != 0)
/**
* For smaller problem sizes, and certain compilers, using the optimized kernels trsmKernelL/R directly
* is faster than the packed versions in TriangularSolverMatrix.h.
@@ -119,7 +121,7 @@ int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap) {
* Used by gemmKernel for the case A/B row-major and C col-major.
*/
template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, Scalar *C_arr,
EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS>& zmm, Scalar* C_arr,
int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
EIGEN_UNUSED_VARIABLE(remN_);
EIGEN_UNUSED_VARIABLE(remM_);
@@ -219,7 +221,7 @@ EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_
* handleKRem: Handle arbitrary K? This is not needed for trsm.
*/
template <typename Scalar, bool isARowMajor, bool isCRowMajor, bool isAdd, bool handleKRem>
void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
void gemmKernel(Scalar* A_arr, Scalar* B_arr, Scalar* C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
int64_t LDC) {
using urolls = unrolls::gemm<Scalar, isAdd>;
constexpr int64_t U3 = urolls::PacketSize * 3;
@@ -262,8 +264,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
}
}
if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<3, 4>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -292,8 +294,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 4;
}
if (M - i >= 2) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<3, 2>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -322,8 +324,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 2;
}
if (M - i > 0) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<3, 1>(zmm);
{
@@ -385,8 +387,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
}
}
if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<2, 4>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -415,8 +417,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 4;
}
if (M - i >= 2) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<2, 2>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -445,8 +447,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 2;
}
if (M - i > 0) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<2, 1>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -506,8 +508,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
}
}
if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 4>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -536,8 +538,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 4;
}
if (M - i >= 2) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 2>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -566,8 +568,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 2;
}
if (M - i > 0) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 1>(zmm);
{
@@ -601,8 +603,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
int64_t i = 0;
for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -630,8 +632,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
}
}
if (M - i >= 4) { // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 4>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -660,8 +662,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 4;
}
if (M - i >= 2) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 2>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -690,8 +692,8 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
i += 2;
}
if (M - i > 0) {
Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar *B_t = &B_arr[0 * LDB + j];
Scalar* A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
Scalar* B_t = &B_arr[0 * LDB + j];
PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
urolls::template setzero<1, 1>(zmm);
for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
@@ -730,7 +732,7 @@ void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t
* The B matrix (RHS) is assumed to be row-major
*/
template <typename Scalar, typename vec, int64_t unrollM, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) {
EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar* A_arr, Scalar* B_arr, int64_t K, int64_t LDA, int64_t LDB) {
static_assert(unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
using urolls = unrolls::trsm<Scalar>;
constexpr int64_t U3 = urolls::PacketSize * 3;
@@ -780,7 +782,7 @@ EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K,
* The B matrix (RHS) is assumed to be row-major
*/
template <typename Scalar, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
void triSolveKernelLxK(Scalar* A_arr, Scalar* B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
// Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
// accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
@@ -811,7 +813,7 @@ void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64
*
*/
template <typename Scalar, bool toTemp = true, bool remM = false>
EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar* B_arr, int64_t LDB, int64_t K, Scalar* B_temp, int64_t LDB_,
int64_t remM_ = 0) {
EIGEN_UNUSED_VARIABLE(remM_);
using urolls = unrolls::transB<Scalar>;
@@ -898,7 +900,7 @@ EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K,
*/
template <typename Scalar, bool isARowMajor = true, bool isBRowMajor = true, bool isFWDSolve = true,
bool isUnitDiag = false>
void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
void triSolve(Scalar* A_arr, Scalar* B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
constexpr int64_t psize = packet_traits<Scalar>::size;
/**
* The values for kB, numM were determined experimentally.
@@ -917,7 +919,7 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
constexpr int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW;
int64_t sizeBTemp = 0;
Scalar *B_temp = NULL;
Scalar* B_temp = NULL;
EIGEN_IF_CONSTEXPR(!isBRowMajor) {
/**
* If B is col-major, we copy it to a fixed-size temporary array of size at most ~numM*kB and
@@ -927,7 +929,7 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
sizeBTemp = (((std::min(kB, numRHS) + psize - 1) / psize + 4) * psize) * numM;
}
EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar *)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar*)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
for (int64_t k = 0; k < numRHS; k += kB) {
int64_t bK = numRHS - k > kB ? kB : numRHS - k;
@@ -1061,7 +1063,6 @@ void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t L
}
// Template specializations of trsmKernelL/R for float/double and inner strides of 1.
#if (EIGEN_USE_AVX512_TRSM_KERNELS)
#if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
bool Specialized>
@@ -1069,19 +1070,19 @@ struct trsmKernelR;
template <typename Index, int Mode, int TriStorageOrder>
struct trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true> {
static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
static void kernel(Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
Index otherStride);
};
template <typename Index, int Mode, int TriStorageOrder>
struct trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true> {
static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
static void kernel(Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
Index otherStride);
};
template <typename Index, int Mode, int TriStorageOrder>
EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
#ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1092,12 +1093,12 @@ EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1
}
#endif
triSolve<float, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
}
template <typename Index, int Mode, int TriStorageOrder>
EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
#ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1108,7 +1109,7 @@ EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder,
}
#endif
triSolve<double, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
}
#endif // (EIGEN_USE_AVX512_TRSM_R_KERNELS)
@@ -1120,19 +1121,19 @@ struct trsmKernelL;
template <typename Index, int Mode, int TriStorageOrder>
struct trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true> {
static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
static void kernel(Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
Index otherStride);
};
template <typename Index, int Mode, int TriStorageOrder>
struct trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true> {
static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
static void kernel(Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
Index otherStride);
};
template <typename Index, int Mode, int TriStorageOrder>
EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
Index size, Index otherSize, const float* _tri, Index triStride, float* _other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
#ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1143,12 +1144,12 @@ EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1
}
#endif
triSolve<float, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
const_cast<float*>(_tri), _other, size, otherSize, triStride, otherStride);
}
template <typename Index, int Mode, int TriStorageOrder>
EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
Index size, Index otherSize, const double* _tri, Index triStride, double* _other, Index otherIncr,
Index otherStride) {
EIGEN_UNUSED_VARIABLE(otherIncr);
#ifdef EIGEN_RUNTIME_NO_MALLOC
@@ -1159,10 +1160,12 @@ EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder,
}
#endif
triSolve<double, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
const_cast<double*>(_tri), _other, size, otherSize, triStride, otherStride);
}
#endif // EIGEN_USE_AVX512_TRSM_L_KERNELS
#endif // EIGEN_USE_AVX512_TRSM_KERNELS
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H

View File

@@ -84,7 +84,6 @@ struct packet_traits<float> : default_packet_traits {
HasIGammac = 1,
HasBetaInc = 1,
HasBlend = 0,
HasFloor = 1,
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
};
@@ -117,7 +116,6 @@ struct packet_traits<double> : default_packet_traits {
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
HasBlend = 0,
};
};

View File

@@ -157,10 +157,7 @@ struct packet_traits<float> : default_packet_traits {
HasMax = 1,
HasConj = 0,
HasSetLinear = 0,
HasBlend = 0,
HasDiv = 0,
HasSin = 0,
HasCos = 0,
HasACos = 0,
@@ -240,18 +237,18 @@ EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
}
template <HVXPacketSize T>
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_hvx(const HVXPacket<T>& a) {
const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
return unpacket_traits<HVXPacket<T>>::half::Create(
Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
return predux_half_dowto4_hvx(a);
EIGEN_STRONG_INLINE Packet16f predux_half(const Packet32f& a) {
return predux_half_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
return predux_half_dowto4_hvx(a);
EIGEN_STRONG_INLINE Packet8f predux_half(const Packet16f& a) {
return predux_half_hvx(a);
}
template <HVXPacketSize T>
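Editor's note: the renamed HVX reduction rotates the packet by half its width and adds it to itself, so lane i of the result holds a[i] + a[i + size/2]. A plain scalar model of that folding step (illustration only, not Eigen code):

```cpp
// Scalar model of the half-reduction: fold an N-lane "packet" to N/2 lanes
// by adding lane i and lane i + N/2.
#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N>
std::array<float, N / 2> predux_half_model(const std::array<float, N>& a) {
  static_assert(N % 2 == 0, "packet size must be even");
  std::array<float, N / 2> r{};
  for (std::size_t i = 0; i < N / 2; ++i) r[i] = a[i] + a[i + N / 2];
  return r;
}

int main() {
  std::array<float, 8> p{1, 2, 3, 4, 5, 6, 7, 8};
  for (float v : predux_half_model(p)) std::cout << v << ' ';  // 6 8 10 12
  std::cout << '\n';
}
```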

View File

@@ -1379,6 +1379,47 @@ EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
return a;
}
template <>
EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) {
return __lsx_vabsd_b(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) {
return __lsx_vabsd_h(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) {
return __lsx_vabsd_w(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pabsdiff(const Packet2l& a, const Packet2l& b) {
return __lsx_vabsd_d(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) {
return __lsx_vabsd_bu(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) {
return __lsx_vabsd_hu(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) {
return __lsx_vabsd_wu(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2ul pabsdiff(const Packet2ul& a, const Packet2ul& b) {
return __lsx_vabsd_du(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) {
return pabs(psub(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d pabsdiff(const Packet2d& a, const Packet2d& b) {
return pabs(psub(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
@@ -2667,11 +2708,6 @@ EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
return v;
}
template <>
EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
return pmin<Packet4f>(a, b);
}
@@ -2733,48 +2769,23 @@ template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
return pldexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
Packet16c v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
Packet8s v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
return __lsx_vbitsel_v(b, a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
Packet4i v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
return __lsx_vbitsel_v(b, a, mask);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
return __lsx_vbitsel_v(b, a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
return __lsx_vdiv_bu(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
Packet16uc v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
const Packet16uc& b) {
return __lsx_vbitsel_v(b, a, mask);
@@ -2791,12 +2802,6 @@ EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
}
return res;
}
template <>
EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
Packet8us v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
return __lsx_vbitsel_v(b, a, mask);
@@ -2814,11 +2819,6 @@ EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
return res;
}
template <>
EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
Packet4ui v = psub(a, b);
return pabs(v);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
return __lsx_vbitsel_v(b, a, mask);
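Editor's note: the `__lsx_vabsd_*` intrinsics added above replace the generic subtract-then-abs fallbacks removed further down. One reason a dedicated absolute-difference operation can matter for unsigned lanes is that plain subtraction wraps; a scalar sketch of the assumed |a - b| semantics (not Eigen code):

```cpp
// Scalar model of element-wise absolute difference. For unsigned types,
// subtract-then-abs wraps modulo 2^8, while max(a,b) - min(a,b) gives the
// intended |a - b|.
#include <cstdint>
#include <iostream>

template <typename T>
T absdiff(T a, T b) {
  return a > b ? T(a - b) : T(b - a);
}

int main() {
  uint8_t a = 3, b = 250;
  std::cout << int(uint8_t(a - b)) << '\n';  // 9   (wrapped)
  std::cout << int(absdiff(a, b)) << '\n';   // 247 (|a - b| as intended)
}
```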

View File

@@ -200,7 +200,6 @@ struct packet_traits<float> : default_packet_traits {
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,
HasBlend = 1,
HasSign = 0 // The manually vectorized version is slightly slower for SSE.
};
};
@@ -230,7 +229,6 @@ struct packet_traits<double> : default_packet_traits {
HasCbrt = 1,
HasATan = 1,
HasATanh = 1,
HasBlend = 1
};
};
template <>
@@ -245,7 +243,6 @@ struct packet_traits<int> : default_packet_traits {
HasCmp = 1,
HasDiv = 1,
HasShift = 1,
HasBlend = 1
};
};
template <>
@@ -257,11 +254,9 @@ struct packet_traits<uint32_t> : default_packet_traits {
AlignedOnScalar = 1,
size = 4,
HasDiv = 0,
HasNegate = 0,
HasCmp = 1,
HasShift = 1,
HasBlend = 1
};
};
template <>
@@ -273,10 +268,8 @@ struct packet_traits<int64_t> : default_packet_traits {
AlignedOnScalar = 1,
size = 2,
HasDiv = 0,
HasCmp = 1,
HasShift = 1,
HasBlend = 1
};
};
#endif

View File

@@ -30,7 +30,7 @@ namespace Eigen {
namespace internal {
template <int has_blend, int lengths>
template <int lengths>
struct sycl_packet_traits : default_packet_traits {
enum {
Vectorizable = 1,
@@ -60,7 +60,6 @@ struct sycl_packet_traits : default_packet_traits {
HasIGamma = 0,
HasIGammac = 0,
HasBetaInc = 0,
HasBlend = has_blend,
// This flag is used to indicate whether packet comparison is supported.
// pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
HasCmp = 1,
@@ -78,19 +77,19 @@ struct sycl_packet_traits : default_packet_traits {
};
#ifdef SYCL_DEVICE_ONLY
#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
template <> \
struct packet_traits<unpacket_type> : sycl_packet_traits<has_blend, lengths> { \
typedef packet_type type; \
typedef packet_type half; \
#define SYCL_PACKET_TRAITS(packet_type, unpacket_type, lengths) \
template <> \
struct packet_traits<unpacket_type> : sycl_packet_traits<lengths> { \
typedef packet_type type; \
typedef packet_type half; \
};
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, const Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, const double, 2)
#undef SYCL_PACKET_TRAITS
// Make sure this is only available when targeting a GPU: we don't want to
@@ -135,14 +134,14 @@ template <typename PacketReturnType, int PacketSize>
struct PacketWrapper {
typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType&) {
eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE");
abort();
}
EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
return ::Eigen::internal::template plset<PacketReturnType>(in);
}
EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar*) {
eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE");
abort();
}
@@ -153,7 +152,7 @@ template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 4> {
typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType& in) {
switch (index) {
case 0:
return in.x();
@@ -174,7 +173,7 @@ struct PacketWrapper<PacketReturnType, 4> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
return PacketReturnType(in, other, other, other);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) {
lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
}
};
@@ -183,20 +182,20 @@ template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 1> {
typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType& in) {
return in;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
return PacketReturnType(in);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { lhs = rhs[0]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) { lhs = rhs[0]; }
};
template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 2> {
typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType& in) {
switch (index) {
case 0:
return in.x();
@@ -213,7 +212,7 @@ struct PacketWrapper<PacketReturnType, 2> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
return PacketReturnType(in, other);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType& lhs, Scalar* rhs) {
lhs = PacketReturnType(rhs[0], rhs[1]);
}
};
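Editor's note: with the `has_blend` parameter gone, the SYCL macro simply stamps one `packet_traits` specialization per (packet type, scalar type, lane count) triple. A self-contained sketch of that stamping pattern, using hypothetical demo names rather than the real SYCL types:

```cpp
// Standalone model of the traits-stamping macro (demo names only).
#include <iostream>

template <typename Scalar>
struct demo_packet_traits { enum { size = 1 }; };  // scalar fallback

struct demo_float4 { float v[4]; };  // stand-in for a 4-lane packet type

#define DEMO_PACKET_TRAITS(packet_type, unpacket_type, lengths) \
  template <>                                                   \
  struct demo_packet_traits<unpacket_type> {                    \
    typedef packet_type type;                                   \
    enum { size = lengths };                                    \
  };

DEMO_PACKET_TRAITS(demo_float4, float, 4)
#undef DEMO_PACKET_TRAITS

int main() { std::cout << demo_packet_traits<float>::size << '\n'; }  // prints 4
```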

View File

@@ -31,13 +31,9 @@ using Packet16i = detail::VectorType<int32_t, 16>;
using Packet8l = detail::VectorType<int64_t, 8>;
// --- packet_traits specializations ---
template <>
struct packet_traits<float> : default_packet_traits {
using type = Packet16f;
using half = Packet16f;
struct generic_float_packet_traits : default_packet_traits {
enum {
Vectorizable = 1,
size = 16,
AlignedOnScalar = 1,
HasAdd = 1,
HasSub = 1,
@@ -46,7 +42,8 @@ struct packet_traits<float> : default_packet_traits {
HasNegate = 1,
HasAbs = 1,
HasRound = 1,
HasMinMax = 1,
HasMin = 1,
HasMax = 1,
HasCmp = 1,
HasSet1 = 1,
HasCast = 1,
@@ -80,12 +77,24 @@ struct packet_traits<float> : default_packet_traits {
};
template <>
struct packet_traits<double> : default_packet_traits {
struct packet_traits<float> : generic_float_packet_traits {
using type = Packet16f;
using half = Packet16f;
enum {
size = 16,
};
};
template <>
struct packet_traits<double> : generic_float_packet_traits {
using type = Packet8d;
using half = Packet8d;
enum { size = 8, HasACos = 0, HasASin = 0 };
};
struct generic_integer_packet_traits : default_packet_traits {
enum {
Vectorizable = 1,
size = 8,
AlignedOnScalar = 1,
HasAdd = 1,
HasSub = 1,
@@ -93,157 +102,85 @@ struct packet_traits<double> : default_packet_traits {
HasDiv = 1,
HasNegate = 1,
HasAbs = 1,
HasRound = 1,
HasMinMax = 1,
HasMin = 1,
HasMax = 1,
HasCmp = 1,
HasSet1 = 1,
HasCast = 1,
HasBitwise = 1,
HasRedux = 1,
HasSign = 1,
// Set remaining to 0
HasRound = 1,
HasSqrt = 0,
HasRsqrt = 0,
HasReciprocal = 0,
HasArg = 0,
HasConj = 1,
// Math functions
HasReciprocal = 1,
HasSin = 1,
HasCos = 1,
HasACos = 0,
HasASin = 0,
HasATan = 1,
HasATanh = 1,
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,
HasExp = 1,
HasPow = 1,
HasNdtri = 1,
HasBessel = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,
HasTanh = 1,
HasErf = 1,
HasErfc = 1
HasExp = 0,
HasLog = 0,
HasSin = 0,
HasCos = 0,
};
};
template <>
struct packet_traits<int32_t> : default_packet_traits {
struct packet_traits<int32_t> : generic_integer_packet_traits {
using type = Packet16i;
using half = Packet16i;
enum {
Vectorizable = 1,
size = 16,
AlignedOnScalar = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 1,
HasMinMax = 1,
HasCmp = 1,
HasSet1 = 1,
HasCast = 1,
HasBitwise = 1,
HasRedux = 1,
// Set remaining to 0
HasRound = 1,
HasSqrt = 0,
HasRsqrt = 0,
HasReciprocal = 0,
HasArg = 0,
HasConj = 1,
HasExp = 0,
HasLog = 0,
HasSin = 0,
HasCos = 0,
};
};
template <>
struct packet_traits<int64_t> : default_packet_traits {
struct packet_traits<int64_t> : generic_integer_packet_traits {
using type = Packet8l;
using half = Packet8l;
enum {
Vectorizable = 1,
size = 8,
AlignedOnScalar = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 1,
HasMinMax = 1,
HasCmp = 1,
HasSet1 = 1,
HasCast = 1,
HasBitwise = 1,
HasRedux = 1,
// Set remaining to 0
HasRound = 1,
HasSqrt = 0,
HasRsqrt = 0,
HasReciprocal = 0,
HasArg = 0,
HasConj = 1,
HasExp = 0,
HasLog = 0,
HasSin = 0,
HasCos = 0,
};
};
// --- unpacket_traits specializations ---
struct generic_unpacket_traits : default_unpacket_traits {
enum {
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
vectorizable = true,
};
};
template <>
struct unpacket_traits<Packet16f> {
struct unpacket_traits<Packet16f> : generic_unpacket_traits {
using type = float;
using half = Packet16f;
using integer_packet = Packet16i;
enum {
size = 16,
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet8d> {
struct unpacket_traits<Packet8d> : generic_unpacket_traits {
using type = double;
using half = Packet8d;
using integer_packet = Packet8l;
enum {
size = 8,
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet16i> {
struct unpacket_traits<Packet16i> : generic_unpacket_traits {
using type = int32_t;
using half = Packet16i;
enum {
size = 16,
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
template <>
struct unpacket_traits<Packet8l> {
struct unpacket_traits<Packet8l> : generic_unpacket_traits {
using type = int64_t;
using half = Packet8l;
enum {
size = 8,
alignment = EIGEN_GENERIC_VECTOR_SIZE_BYTES,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
};
@@ -458,8 +395,8 @@ EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::pcast_double_to_long, detail:
#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT
// --- Min/Max operations ---
#if __has_builtin(__builtin_elementwise_min) && __has_builtin(__builtin_elementwise_max) && \
__has_builtin(__builtin_elementwise_abs)
#if EIGEN_HAS_BUILTIN(__builtin_elementwise_min) && EIGEN_HAS_BUILTIN(__builtin_elementwise_max) && \
EIGEN_HAS_BUILTIN(__builtin_elementwise_abs)
#define EIGEN_CLANG_PACKET_ELEMENTWISE(PACKET_TYPE) \
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE pmin<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
@@ -494,7 +431,7 @@ EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::pcast_double_to_long, detail:
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE pselect<PACKET_TYPE>(const PACKET_TYPE& mask, const PACKET_TYPE& a, \
const PACKET_TYPE& b) { \
return __builtin_elementwise_abs(mask) == 0 ? b : a; \
return mask != 0 ? a : b; \
}
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f)
@@ -506,9 +443,9 @@ EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
// --- Math functions (float/double only) ---
#if __has_builtin(__builtin_elementwise_floor) && __has_builtin(__builtin_elementwise_ceil) && \
__has_builtin(__builtin_elementwise_round) && __has_builtin(__builtin_elementwise_roundeven) && \
__has_builtin(__builtin_elementwise_trunc) && __has_builtin(__builtin_elementwise_sqrt)
#if EIGEN_HAS_BUILTIN(__builtin_elementwise_floor) && EIGEN_HAS_BUILTIN(__builtin_elementwise_ceil) && \
EIGEN_HAS_BUILTIN(__builtin_elementwise_round) && EIGEN_HAS_BUILTIN(__builtin_elementwise_roundeven) && \
EIGEN_HAS_BUILTIN(__builtin_elementwise_trunc) && EIGEN_HAS_BUILTIN(__builtin_elementwise_sqrt)
#define EIGEN_CLANG_PACKET_MATH_FLOAT(PACKET_TYPE) \
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE pfloor<PACKET_TYPE>(const PACKET_TYPE& a) { \
@@ -541,7 +478,7 @@ EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
#endif
// --- Fused Multiply-Add (MADD) ---
#if defined(__FMA__) && __has_builtin(__builtin_elementwise_fma)
#if defined(__FMA__) && EIGEN_HAS_BUILTIN(__builtin_elementwise_fma)
#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE) \
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
@@ -589,11 +526,11 @@ EIGEN_CLANG_PACKET_MADD(Packet8d)
EIGEN_STRONG_INLINE PACKET_TYPE pgather<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>( \
const unpacket_traits<PACKET_TYPE>::type* from, Index stride) { \
constexpr int size = unpacket_traits<PACKET_TYPE>::size; \
unpacket_traits<PACKET_TYPE>::type arr[size]; \
PACKET_TYPE result; \
for (int i = 0; i < size; ++i) { \
arr[i] = from[i * stride]; \
result[i] = from[i * stride]; \
} \
return *reinterpret_cast<PACKET_TYPE*>(arr); \
return result; \
}
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f)
@@ -603,7 +540,7 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
// ---- Various operations that depend on __builtin_shufflevector.
#if __has_builtin(__builtin_shufflevector)
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) {
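Editor's note: the new `pselect` body relies on clang's element-wise conditional operator for vector types, as used in the hunk above: `mask != 0` yields a lane mask and the ternary selects per lane. A minimal standalone illustration (clang-only, `ext_vector_type` as in this backend):

```cpp
// Requires clang (e.g. `clang++ -std=c++17 select_demo.cpp`).
#include <iostream>

typedef float float4 __attribute__((ext_vector_type(4)));
typedef int   int4   __attribute__((ext_vector_type(4)));

float4 select_demo(int4 mask, float4 a, float4 b) {
  // Lane-wise: where mask is non-zero take a, elsewhere take b.
  return mask != 0 ? a : b;
}

int main() {
  float4 a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
  int4 m = {1, 0, 1, 0};
  float4 r = select_demo(m, a, b);
  std::cout << r[0] << ' ' << r[1] << ' ' << r[2] << ' ' << r[3] << '\n';  // 1 20 3 40
}
```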

View File

@@ -14,7 +14,8 @@ namespace Eigen {
namespace internal {
// --- Reductions ---
#if __has_builtin(__builtin_reduce_min) && __has_builtin(__builtin_reduce_max) && __has_builtin(__builtin_reduce_or)
#if EIGEN_HAS_BUILTIN(__builtin_reduce_min) && EIGEN_HAS_BUILTIN(__builtin_reduce_max) && \
EIGEN_HAS_BUILTIN(__builtin_reduce_or)
#define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE) \
template <> \
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_min(const PACKET_TYPE& a) { \
@@ -36,7 +37,7 @@ EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
#undef EIGEN_CLANG_PACKET_REDUX_MINMAX
#endif
#if __has_builtin(__builtin_reduce_add) && __has_builtin(__builtin_reduce_mul)
#if EIGEN_HAS_BUILTIN(__builtin_reduce_add) && EIGEN_HAS_BUILTIN(__builtin_reduce_mul)
#define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE) \
template <> \
EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux<PACKET_TYPE>(const PACKET_TYPE& a) { \
@@ -53,7 +54,7 @@ EIGEN_CLANG_PACKET_REDUX_INT(Packet8l)
#undef EIGEN_CLANG_PACKET_REDUX_INT
#endif
#if __has_builtin(__builtin_shufflevector)
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
template <typename VectorT>
EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceAdd16(const VectorT& a) {
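Editor's note: for reference, the horizontal-reduction builtins guarded above collapse a whole vector to one scalar in a single call; per clang's documentation, `__builtin_reduce_add`/`__builtin_reduce_mul` take integer vectors, while the min/max variants also accept floating-point ones. A tiny clang-only example:

```cpp
// Requires clang (e.g. `clang++ -std=c++17 reduce_demo.cpp`).
#include <iostream>

typedef int int4 __attribute__((ext_vector_type(4)));

int main() {
  int4 v = {1, 2, 3, 4};
  std::cout << __builtin_reduce_add(v) << ' '    // 10
            << __builtin_reduce_mul(v) << ' '    // 24
            << __builtin_reduce_max(v) << '\n';  // 4
}
```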

View File

@@ -37,7 +37,7 @@ EIGEN_STRONG_INLINE Packet8l preinterpret<Packet8l, Packet8d>(const Packet8d& a)
//==============================================================================
// pcast
//==============================================================================
#if __has_builtin(__builtin_convertvector)
#if EIGEN_HAS_BUILTIN(__builtin_convertvector)
template <>
EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
return __builtin_convertvector(a, Packet16i);

View File

@@ -106,26 +106,6 @@ struct functor_traits<scalar_abs2_op<Scalar>> {
};
};
template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct squared_norm_functor {
typedef Scalar result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
return Scalar(numext::real(a) * numext::real(a), numext::imag(a) * numext::imag(a));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
return Packet(pmul(a.v, a.v));
}
};
template <typename Scalar>
struct squared_norm_functor<Scalar, false> : scalar_abs2_op<Scalar> {};
template <typename Scalar>
struct functor_traits<squared_norm_functor<Scalar>> {
using Real = typename NumTraits<Scalar>::Real;
enum { Cost = NumTraits<Real>::MulCost, PacketAccess = packet_traits<Real>::HasMul };
};
/** \internal
* \brief Template functor to compute the conjugate of a complex value
*
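Editor's note: the removed functor packed re^2 and im^2 into a complex value so their sum could be recovered afterwards; the identity behind it is |z|^2 = re(z)^2 + im(z)^2. A quick check of that identity using only public Eigen API:

```cpp
// |z|^2 = re(z)^2 + im(z)^2 drives the squared norm of a complex vector.
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main() {
  Eigen::VectorXcf v = Eigen::VectorXcf::Random(8);
  float direct = v.squaredNorm();
  float manual = 0.f;
  for (Eigen::Index i = 0; i < v.size(); ++i)
    manual += std::norm(v[i]);  // std::norm(z) == re^2 + im^2
  std::cout << (std::abs(direct - manual) < 1e-4f ? "ok" : "mismatch") << '\n';
}
```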

View File

@@ -517,6 +517,9 @@ struct eigen_zero_impl;
template <typename Packet>
struct has_packet_segment : std::false_type {};
template <typename T>
struct complex_array_access;
} // namespace internal
} // end namespace Eigen

View File

@@ -700,6 +700,13 @@
#define EIGEN_HAS_BUILTIN(x) 0
#endif
// Cross compiler wrapper around LLVM's __has_attribute
#ifdef __has_attribute
#define EIGEN_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
#define EIGEN_HAS_ATTRIBUTE(x) 0
#endif
// A Clang feature extension to determine compiler features.
// We use it to determine 'cxx_rvalue_references'
#ifndef __has_feature
@@ -831,7 +838,7 @@
#endif
// Does the compiler support vector types?
#if __has_attribute(ext_vector_type) && __has_builtin(__builtin_vectorelements)
#if EIGEN_HAS_ATTRIBUTE(ext_vector_type) && EIGEN_HAS_BUILTIN(__builtin_vectorelements)
#define EIGEN_ARCH_VECTOR_EXTENSIONS 1
#else
#define EIGEN_ARCH_VECTOR_EXTENSIONS 0
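Editor's note: the new `EIGEN_HAS_ATTRIBUTE` wrapper follows the same pattern as `EIGEN_HAS_BUILTIN`: on compilers that do not define `__has_attribute` at all, the check degrades to 0 instead of failing to preprocess. A standalone sketch of the idiom (demo macro names, not the Eigen ones):

```cpp
// Portable feature-test wrapper (demo names): unknown checks become 0 rather
// than a preprocessor error on compilers lacking __has_attribute.
#ifdef __has_attribute
#define DEMO_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
#define DEMO_HAS_ATTRIBUTE(x) 0
#endif

#if DEMO_HAS_ATTRIBUTE(ext_vector_type)
typedef float demo_float4 __attribute__((ext_vector_type(4)));  // vector path
#else
struct demo_float4 { float v[4]; };  // scalar fallback
#endif

int main() {
  demo_float4 x = {1, 2, 3, 4};
  (void)x;
}
```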

View File

@@ -720,6 +720,7 @@ JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(con
m_isInitialized = true;
m_info = InvalidInput;
m_nonzeroSingularValues = 0;
m_singularValues.setZero();
return *this;
}
if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
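Editor's note: with the added `setZero()`, a caller that checks `info()` after a failed decomposition also sees zeroed singular values rather than stale storage. A usage sketch with public API only (the NaN below is injected on the assumption that a non-finite entry takes the invalid-input path):

```cpp
// Check info() after construction; on invalid input the singular values are
// now zeroed instead of left uninitialized.
#include <Eigen/SVD>
#include <iostream>
#include <limits>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 3);
  A(1, 2) = std::numeric_limits<double>::quiet_NaN();  // non-finite entry
  Eigen::JacobiSVD<Eigen::MatrixXd> svd(A);            // singular values only
  if (svd.info() != Eigen::Success) {
    std::cout << "decomposition failed: " << svd.singularValues().transpose() << '\n';
  }
}
```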

View File

@@ -30,12 +30,10 @@ EIGEN_DONT_INLINE Scalar check_in_range(Scalar x, Scalar y) {
template <typename Scalar>
void check_all_in_range(Scalar x, Scalar y) {
constexpr int repeats = 32;
uint64_t count = static_cast<uint64_t>(y) - static_cast<uint64_t>(x) + 1;
Index count = static_cast<Index>(y) - static_cast<Index>(x) + 1;
ArrayX<bool> mask(count);
// ensure that `count` does not overflow the return type of `mask.size()`
VERIFY(count == static_cast<uint64_t>(mask.size()));
mask.setConstant(false);
for (uint64_t k = 0; k < count; k++)
for (Index k = 0; k < count; k++)
for (int repeat = 0; repeat < repeats; repeat++) {
Scalar r = check_in_range(x, y);
Index i = static_cast<Index>(r) - static_cast<Index>(x);

View File

@@ -9,8 +9,20 @@
#include "main.h"
// wrapper that disables array-oriented access to real and imaginary components
struct TestComplex : public std::complex<float> {
TestComplex() = default;
TestComplex(const TestComplex&) = default;
TestComplex(std::complex<float> x) : std::complex<float>(x){};
TestComplex(float x) : std::complex<float>(x){};
};
template <>
struct NumTraits<TestComplex> : NumTraits<std::complex<float>> {};
template <>
struct internal::random_impl<TestComplex> : internal::random_impl<std::complex<float>> {};
template <typename T>
void test_realview(const T&) {
void test_realview_readonly(const T&) {
using Scalar = typename T::Scalar;
using RealScalar = typename NumTraits<Scalar>::Real;
@@ -26,21 +38,16 @@ void test_realview(const T&) {
Index rows = internal::random<Index>(minRows, maxRows);
Index cols = internal::random<Index>(minCols, maxCols);
T A(rows, cols), B, C;
T A(rows, cols), B(rows, cols);
VERIFY(A.realView().rows() == rowFactor * A.rows());
VERIFY(A.realView().cols() == colFactor * A.cols());
VERIFY(A.realView().size() == sizeFactor * A.size());
RealScalar alpha = internal::random(RealScalar(1), RealScalar(2));
A.setRandom();
VERIFY_IS_APPROX(A.matrix().cwiseAbs2().sum(), A.realView().matrix().cwiseAbs2().sum());
VERIFY_IS_APPROX(A.matrix().squaredNorm(), A.realView().matrix().squaredNorm());
// test re-sizing realView during assignment
B.realView() = A.realView();
VERIFY_IS_APPROX(A, B);
VERIFY_IS_APPROX(A.realView(), B.realView());
RealScalar alpha = internal::random(RealScalar(1), RealScalar(2));
// B = A * alpha
for (Index r = 0; r < rows; r++) {
@@ -48,14 +55,7 @@ void test_realview(const T&) {
B.coeffRef(r, c) = A.coeff(r, c) * Scalar(alpha);
}
}
VERIFY_IS_APPROX(B.realView(), A.realView() * alpha);
C = A;
C.realView() *= alpha;
VERIFY_IS_APPROX(B, C);
alpha = internal::random(RealScalar(1), RealScalar(2));
A.setRandom();
VERIFY_IS_CWISE_APPROX(B.realView(), A.realView() * alpha);
// B = A / alpha
for (Index r = 0; r < rows; r++) {
@@ -63,15 +63,155 @@ void test_realview(const T&) {
B.coeffRef(r, c) = A.coeff(r, c) / Scalar(alpha);
}
}
VERIFY_IS_CWISE_APPROX(B.realView(), A.realView() / alpha);
}
template <typename T>
void test_realview(const T&) {
using Scalar = typename T::Scalar;
using RealScalar = typename NumTraits<Scalar>::Real;
constexpr Index minRows = T::RowsAtCompileTime == Dynamic ? 1 : T::RowsAtCompileTime;
constexpr Index maxRows = T::MaxRowsAtCompileTime == Dynamic ? (EIGEN_TEST_MAX_SIZE / 2) : T::MaxRowsAtCompileTime;
constexpr Index minCols = T::ColsAtCompileTime == Dynamic ? 1 : T::ColsAtCompileTime;
constexpr Index maxCols = T::MaxColsAtCompileTime == Dynamic ? (EIGEN_TEST_MAX_SIZE / 2) : T::MaxColsAtCompileTime;
constexpr Index rowFactor = (NumTraits<Scalar>::IsComplex && !T::IsRowMajor) ? 2 : 1;
constexpr Index colFactor = (NumTraits<Scalar>::IsComplex && T::IsRowMajor) ? 2 : 1;
constexpr Index sizeFactor = NumTraits<Scalar>::IsComplex ? 2 : 1;
const Index rows = internal::random<Index>(minRows, maxRows);
const Index cols = internal::random<Index>(minCols, maxCols);
const Index realViewRows = rowFactor * rows;
const Index realViewCols = colFactor * cols;
const T A = T::Random(rows, cols);
T B;
VERIFY_IS_EQUAL(A.realView().rows(), rowFactor * A.rows());
VERIFY_IS_EQUAL(A.realView().cols(), colFactor * A.cols());
VERIFY_IS_EQUAL(A.realView().size(), sizeFactor * A.size());
VERIFY_IS_APPROX(A.matrix().cwiseAbs2().sum(), A.realView().matrix().cwiseAbs2().sum());
// test re-sizing realView during assignment
B.realView() = A.realView();
VERIFY_IS_APPROX(A, B);
VERIFY_IS_APPROX(A.realView(), B.realView());
const RealScalar alpha = internal::random(RealScalar(1), RealScalar(2));
// B = A * alpha
for (Index r = 0; r < rows; r++) {
for (Index c = 0; c < cols; c++) {
B.coeffRef(r, c) = A.coeff(r, c) * Scalar(alpha);
}
}
VERIFY_IS_APPROX(B.realView(), A.realView() * alpha);
B = A;
B.realView() *= alpha;
VERIFY_IS_APPROX(B.realView(), A.realView() * alpha);
// B = A / alpha
for (Index r = 0; r < rows; r++) {
for (Index c = 0; c < cols; c++) {
B.coeffRef(r, c) = A.coeff(r, c) / Scalar(alpha);
}
}
VERIFY_IS_APPROX(B.realView(), A.realView() / alpha);
B = A;
B.realView() /= alpha;
VERIFY_IS_APPROX(B.realView(), A.realView() / alpha);
// force some usual access patterns
Index malloc_size = (rows * cols * sizeof(Scalar)) + sizeof(RealScalar);
void* data1 = internal::aligned_malloc(malloc_size);
void* data2 = internal::aligned_malloc(malloc_size);
Scalar* ptr1 = reinterpret_cast<Scalar*>(reinterpret_cast<uint8_t*>(data1) + sizeof(RealScalar));
Scalar* ptr2 = reinterpret_cast<Scalar*>(reinterpret_cast<uint8_t*>(data2) + sizeof(RealScalar));
Map<T> C(ptr1, rows, cols), D(ptr2, rows, cols);
C.setRandom();
D.setRandom();
for (Index r = 0; r < realViewRows; r++) {
for (Index c = 0; c < realViewCols; c++) {
C.realView().coeffRef(r, c) = D.realView().coeff(r, c);
}
}
VERIFY_IS_CWISE_EQUAL(C, D);
C = A;
C.realView() /= alpha;
VERIFY_IS_APPROX(B, C);
for (Index c = 0; c < realViewCols - 1; c++) {
B.realView().row(0).coeffRef(realViewCols - 1 - c) = C.realView().row(0).coeff(c + 1);
}
D.realView().row(0).tail(realViewCols - 1) = C.realView().row(0).tail(realViewCols - 1).reverse();
VERIFY_IS_CWISE_EQUAL(B.realView().row(0).tail(realViewCols - 1), D.realView().row(0).tail(realViewCols - 1));
for (Index r = 0; r < realViewRows - 1; r++) {
B.realView().col(0).coeffRef(realViewRows - 1 - r) = C.realView().col(0).coeff(r + 1);
}
D.realView().col(0).tail(realViewRows - 1) = C.realView().col(0).tail(realViewRows - 1).reverse();
VERIFY_IS_CWISE_EQUAL(B.realView().col(0).tail(realViewRows - 1), D.realView().col(0).tail(realViewRows - 1));
}
template <typename ComplexScalar, bool Enable = internal::packet_traits<ComplexScalar>::Vectorizable>
struct test_edge_cases_impl {
static void run() {
using namespace internal;
using RealScalar = typename NumTraits<ComplexScalar>::Real;
using ComplexPacket = typename packet_traits<ComplexScalar>::type;
using RealPacket = typename unpacket_traits<ComplexPacket>::as_real;
constexpr int ComplexSize = unpacket_traits<ComplexPacket>::size;
constexpr int RealSize = 2 * ComplexSize;
VectorX<ComplexScalar> a_data(2 * ComplexSize);
Map<const VectorX<RealScalar>> a_data_asreal(reinterpret_cast<const RealScalar*>(a_data.data()), 2 * a_data.size());
VectorX<RealScalar> b_data(RealSize);
a_data.setRandom();
evaluator<RealView<VectorX<ComplexScalar>>> eval(a_data.realView());
for (Index offset = 0; offset < RealSize; offset++) {
for (Index begin = 0; offset + begin < RealSize; begin++) {
for (Index count = 0; begin + count < RealSize; count++) {
b_data.setRandom();
RealPacket res = eval.template packetSegment<Unaligned, RealPacket>(offset, begin, count);
pstoreSegment(b_data.data(), res, begin, count);
VERIFY_IS_CWISE_EQUAL(a_data_asreal.segment(offset + begin, count), b_data.segment(begin, count));
}
}
}
}
};
template <typename ComplexScalar>
struct test_edge_cases_impl<ComplexScalar, false> {
static void run() {}
};
template <typename ComplexScalar>
void test_edge_cases(const ComplexScalar&) {
test_edge_cases_impl<ComplexScalar>::run();
}
template <typename Scalar, int Rows, int Cols, int MaxRows = Rows, int MaxCols = Cols>
void test_realview_driver() {
void test_realview_readonly() {
// if Rows == 1, don't test ColMajor as it is not a valid array
using ColMajorMatrixType = Matrix<Scalar, Rows, Cols, Rows == 1 ? RowMajor : ColMajor, MaxRows, MaxCols>;
using ColMajorArrayType = Array<Scalar, Rows, Cols, Rows == 1 ? RowMajor : ColMajor, MaxRows, MaxCols>;
// if Cols == 1, don't test RowMajor as it is not a valid array
using RowMajorMatrixType = Matrix<Scalar, Rows, Cols, Cols == 1 ? ColMajor : RowMajor, MaxRows, MaxCols>;
using RowMajorArrayType = Array<Scalar, Rows, Cols, Cols == 1 ? ColMajor : RowMajor, MaxRows, MaxCols>;
test_realview_readonly(ColMajorMatrixType());
test_realview_readonly(ColMajorArrayType());
test_realview_readonly(RowMajorMatrixType());
test_realview_readonly(RowMajorArrayType());
}
template <typename Scalar, int Rows, int Cols, int MaxRows = Rows, int MaxCols = Cols>
void test_realview_readwrite() {
// if Rows == 1, don't test ColMajor as it is not a valid array
using ColMajorMatrixType = Matrix<Scalar, Rows, Cols, Rows == 1 ? RowMajor : ColMajor, MaxRows, MaxCols>;
using ColMajorArrayType = Array<Scalar, Rows, Cols, Rows == 1 ? RowMajor : ColMajor, MaxRows, MaxCols>;
@@ -85,26 +225,29 @@ void test_realview_driver() {
}
template <int Rows, int Cols, int MaxRows = Rows, int MaxCols = Cols>
void test_realview_driver_complex() {
test_realview_driver<float, Rows, Cols, MaxRows, MaxCols>();
test_realview_driver<std::complex<float>, Rows, Cols, MaxRows, MaxCols>();
test_realview_driver<double, Rows, Cols, MaxRows, MaxCols>();
test_realview_driver<std::complex<double>, Rows, Cols, MaxRows, MaxCols>();
test_realview_driver<long double, Rows, Cols, MaxRows, MaxCols>();
test_realview_driver<std::complex<long double>, Rows, Cols, MaxRows, MaxCols>();
void test_realview() {
test_realview_readwrite<float, Rows, Cols, MaxRows, MaxCols>();
test_realview_readwrite<std::complex<float>, Rows, Cols, MaxRows, MaxCols>();
test_realview_readwrite<double, Rows, Cols, MaxRows, MaxCols>();
test_realview_readwrite<std::complex<double>, Rows, Cols, MaxRows, MaxCols>();
test_realview_readwrite<long double, Rows, Cols, MaxRows, MaxCols>();
test_realview_readwrite<std::complex<long double>, Rows, Cols, MaxRows, MaxCols>();
test_realview_readonly<TestComplex, Rows, Cols, MaxRows, MaxCols>();
}
EIGEN_DECLARE_TEST(realview) {
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1((test_realview_driver_complex<Dynamic, Dynamic, Dynamic, Dynamic>()));
CALL_SUBTEST_2((test_realview_driver_complex<Dynamic, Dynamic, 17, Dynamic>()));
CALL_SUBTEST_3((test_realview_driver_complex<Dynamic, Dynamic, Dynamic, 19>()));
CALL_SUBTEST_4((test_realview_driver_complex<Dynamic, Dynamic, 17, 19>()));
CALL_SUBTEST_5((test_realview_driver_complex<17, Dynamic, 17, Dynamic>()));
CALL_SUBTEST_6((test_realview_driver_complex<Dynamic, 19, Dynamic, 19>()));
CALL_SUBTEST_7((test_realview_driver_complex<17, 19, 17, 19>()));
CALL_SUBTEST_8((test_realview_driver_complex<Dynamic, 1>()));
CALL_SUBTEST_9((test_realview_driver_complex<1, Dynamic>()));
CALL_SUBTEST_10((test_realview_driver_complex<1, 1>()));
CALL_SUBTEST_1((test_realview<Dynamic, Dynamic, Dynamic, Dynamic>()));
CALL_SUBTEST_2((test_realview<Dynamic, Dynamic, 17, Dynamic>()));
CALL_SUBTEST_3((test_realview<Dynamic, Dynamic, Dynamic, 19>()));
CALL_SUBTEST_4((test_realview<Dynamic, Dynamic, 17, 19>()));
CALL_SUBTEST_5((test_realview<17, Dynamic, 17, Dynamic>()));
CALL_SUBTEST_6((test_realview<Dynamic, 19, Dynamic, 19>()));
CALL_SUBTEST_7((test_realview<17, 19, 17, 19>()));
CALL_SUBTEST_8((test_realview<Dynamic, 1>()));
CALL_SUBTEST_9((test_realview<1, Dynamic>()));
CALL_SUBTEST_10((test_realview<1, 1>()));
CALL_SUBTEST_11(test_edge_cases(std::complex<float>()));
CALL_SUBTEST_12(test_edge_cases(std::complex<double>()));
}
}
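Editor's note: a compact illustration of the layout the row/column factors above encode. For a column-major complex matrix, `realView()` exposes twice as many rows, each complex coefficient contributing an adjacent (real, imaginary) pair. Sketch only, assuming the view assigns to a plain real matrix like any other dense expression, as the test's `B.realView() = A.realView()` suggests:

```cpp
// realView() on a column-major complex matrix: 2*rows x cols real
// coefficients, (re, im) pairs down each column.
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main() {
  Eigen::Matrix2cf A;
  A << std::complex<float>(1, 2), std::complex<float>(3, 4),
       std::complex<float>(5, 6), std::complex<float>(7, 8);
  Eigen::Matrix<float, 4, 2> R = A.realView();  // copy the view into a real matrix
  std::cout << R << '\n';
  // Expected:
  // 1 3
  // 2 4
  // 5 7
  // 6 8
  std::cout << A.squaredNorm() << " == " << R.squaredNorm() << '\n';  // 204 == 204
}
```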

View File

@@ -26,7 +26,7 @@ static void test_parallel_for(int granularity) {
TestData test_data = make_test_data(/*num_threads=*/4, kNumTasks);
std::atomic<uint64_t> sum(0);
std::function<void(Index, Index)> binary_do_fn = [&](Index i, Index j) {
for (int k = i; k < j; ++k)
for (Index k = i; k < j; ++k)
for (uint64_t new_sum = sum; !sum.compare_exchange_weak(new_sum, new_sum + test_data.data[k]);) {
};
};
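Editor's note: the loop body above accumulates into an atomic via a compare-exchange retry. Outside the threadpool harness, the same idiom looks like this (plain std::thread, arbitrary work sizes chosen for the demo):

```cpp
// Standalone model of the compare_exchange_weak accumulation used in the test.
#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  std::atomic<uint64_t> sum(0);
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&sum] {
      for (uint64_t k = 1; k <= 1000; ++k) {
        // Retry until our addition is published without losing other updates.
        for (uint64_t expected = sum; !sum.compare_exchange_weak(expected, expected + k);) {
        }
      }
    });
  }
  for (auto& w : workers) w.join();
  std::cout << sum.load() << '\n';  // 4 * (1000 * 1001 / 2) = 2002000
}
```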

View File

@@ -312,6 +312,9 @@ static void test_select() {
}
}
// Nan propagation does currently not work like one would expect from std::max/std::min,
// so we disable it for now
#if !EIGEN_ARCH_ARM_OR_ARM64
template <typename Scalar>
void test_minmax_nan_propagation_templ() {
for (int size = 1; size < 17; ++size) {
@@ -430,6 +433,10 @@ void test_minmax_nan_propagation_templ() {
VERIFY_IS_EQUAL(val(), (size == 1 ? -kInf : kZero));
}
}
#else
template <typename Scalar>
void test_minmax_nan_propagation_templ() {}
#endif
static void test_clip() {
Tensor<float, 1> vec(6);
@@ -465,10 +472,5 @@ EIGEN_DECLARE_TEST(cxx11_tensor_expr) {
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
CALL_SUBTEST(test_clip());
// Nan propagation does currently not work like one would expect from std::max/std::min,
// so we disable it for now
#if !EIGEN_ARCH_ARM_OR_ARM64
CALL_SUBTEST(test_minmax_nan_propagation());
#endif
}