Mirror of https://gitlab.com/libeigen/eigen.git, synced 2026-01-18 17:31:19 +01:00

Merge remote-tracking branch 'origin2/master'

CMakeLists.txt (107 lines changed)
@@ -34,6 +34,11 @@ if (POLICY CMP0177)
cmake_policy(SET CMP0177 NEW)
endif ()

# Respect <PackageName>_ROOT variables.
if (POLICY CMP0074)
cmake_policy(SET CMP0074 NEW)
endif ()

#==============================================================================
# CMake Project.
#==============================================================================
@@ -71,6 +76,7 @@ endif()

option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF)
option(EIGEN_BUILD_AOCL_BENCH "Build AOCL benchmark" OFF)
# Avoid building docs if included from another project.
# Building documentation requires creating and running executables on the host
# platform. We shouldn't do this if cross-compiling.
@@ -100,19 +106,19 @@ endif()
file(READ "${PROJECT_SOURCE_DIR}/Eigen/Version" _eigen_version_header)
if (NOT DEFINED EIGEN_WORLD_VERSION)
string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen_world_version_match "${_eigen_version_header}")
set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_WORLD_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
endif()
if (NOT DEFINED EIGEN_MAJOR_VERSION)
string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen_major_version_match "${_eigen_version_header}")
set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_MAJOR_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
endif()
if (NOT DEFINED EIGEN_MINOR_VERSION)
string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_version_match "${_eigen_version_header}")
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
endif()
if (NOT DEFINED EIGEN_PATCH_VERSION)
string(REGEX MATCH "define[ \t]+EIGEN_PATCH_VERSION[ \t]+([0-9]+)" _eigen_patch_version_match "${_eigen_version_header}")
set(EIGEN_PATCH_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_PATCH_VERSION "${CMAKE_MATCH_1}" CACHE STRING "")
endif()
if (NOT DEFINED EIGEN_PRERELEASE_VERSION)
set(EIGEN_PRERELEASE_VERSION "dev")
@@ -134,18 +140,18 @@ endif()
if (NOT DEFINED EIGEN_BUILD_VERSION AND DEFINED EIGEN_GIT_REVNUM)
string(SUBSTRING "${EIGEN_GIT_REVNUM}" 0 8 EIGEN_BUILD_VERSION)
else()
set(EIGEN_BUILD_VERSION "")
set(EIGEN_BUILD_VERSION "" CACHE STRING "")
endif()

# The EIGEN_VERSION_NUMBER must be of the form <major.minor.patch>.
# The EIGEN_VERSION_STRING can contain the prerelease/build strings.
set(EIGEN_VERSION_NUMBER "${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}.${EIGEN_PATCH_VERSION}")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_NUMBER}")
set(EIGEN_VERSION_NUMBER "${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}.${EIGEN_PATCH_VERSION}" CACHE STRING "")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_NUMBER}" CACHE STRING "")
if (NOT "x${EIGEN_PRERELEASE_VERSION}" STREQUAL "x")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}-${EIGEN_PRERELEASE_VERSION}")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}-${EIGEN_PRERELEASE_VERSION}" CACHE STRING "")
endif()
if (NOT "x${EIGEN_BUILD_VERSION}" STREQUAL "x")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}+${EIGEN_BUILD_VERSION}")
set(EIGEN_VERSION_STRING "${EIGEN_VERSION_STRING}+${EIGEN_BUILD_VERSION}" CACHE STRING "")
endif()
@@ -305,17 +311,29 @@ if (EIGEN_IS_BUILDING_)
set(CMAKE_INCLUDE_CURRENT_DIR OFF)

find_package(StandardMathLibrary)
find_package(AOCL QUIET)
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
if(NOT STANDARD_MATH_LIBRARY_FOUND)
message(FATAL_ERROR
"Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
else()
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
else()
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
if(AOCL_FOUND)
list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${AOCL_LIBRARIES})
if(AOCL_INCLUDE_DIRS)
include_directories(${AOCL_INCLUDE_DIRS})
endif()
endif()

if(NOT STANDARD_MATH_LIBRARY_FOUND)
message(FATAL_ERROR
"Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
else()
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
else()
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
endif()
# Clean up any leading/trailing whitespace in the variable to avoid CMP0004 errors
string(STRIP "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}" EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
endif()


if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
else()
@@ -743,6 +761,57 @@ endif()
if(NOT WIN32 AND EIGEN_BUILD_SPBENCH)
add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
endif()
#--------------------------------------------------------------------------------------#
#                              AOCL BENCHMARK BUILD SECTION                             #
#--------------------------------------------------------------------------------------#
if(EIGEN_BUILD_AOCL_BENCH)
# Allow users to override the default architecture
set(EIGEN_AOCL_BENCH_ARCH "znver5" CACHE STRING "Target architecture for AOCL benchmark")
add_executable(benchmark_aocl EXCLUDE_FROM_ALL bench/benchmark_aocl.cpp)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-march=${EIGEN_AOCL_BENCH_ARCH}" COMPILER_SUPPORTS_AOCL_ARCH)
if(COMPILER_SUPPORTS_AOCL_ARCH)
target_compile_options(benchmark_aocl PRIVATE -O3 -Wno-shadow -march=${EIGEN_AOCL_BENCH_ARCH})
else()
message(WARNING "${EIGEN_AOCL_BENCH_ARCH} architecture not supported by compiler")
target_compile_options(benchmark_aocl PRIVATE -O3)
endif()

# Add custom flags if provided
if(EIGEN_AOCL_BENCH_FLAGS)
separate_arguments(CUSTOM_FLAGS NATIVE_COMMAND "${EIGEN_AOCL_BENCH_FLAGS}")
target_compile_options(benchmark_aocl PRIVATE ${CUSTOM_FLAGS})
# Check if OpenMP is requested in custom flags and link it
string(FIND "${EIGEN_AOCL_BENCH_FLAGS}" "-fopenmp" OPENMP_REQUESTED)
if(NOT OPENMP_REQUESTED EQUAL -1)
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
target_link_libraries(benchmark_aocl OpenMP::OpenMP_CXX)
else()
# Generic fallback: let compiler handle OpenMP linking
if(MSVC)
target_compile_options(benchmark_aocl PRIVATE "/openmp")
else()
target_compile_options(benchmark_aocl PRIVATE "-fopenmp")
target_link_options(benchmark_aocl PRIVATE "-fopenmp")
endif()
message(STATUS "Using compiler OpenMP flags as fallback")
endif()
endif()
endif()

target_include_directories(benchmark_aocl PRIVATE ${INCLUDE_INSTALL_DIR})
if(EIGEN_AOCL_BENCH_USE_MT)
target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_MT)
else()
target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_ALL)
endif()
target_link_libraries(benchmark_aocl Eigen3::Eigen)
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
target_link_libraries(benchmark_aocl ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
endif()
endif()
#----------------------------------------------------------------------------------------#

if (EIGEN_BUILD_DEMOS)
add_subdirectory(demos EXCLUDE_FROM_ALL)
@@ -792,6 +861,9 @@ if(PROJECT_IS_TOP_LEVEL)
if (EIGEN_BUILD_LAPACK)
message(STATUS "lapack | Build LAPACK subset library (not the same thing as Eigen)")
endif()
if(EIGEN_BUILD_AOCL_BENCH)
message(STATUS "benchmark_aocl | Build AOCL benchmark executable")
endif()
message(STATUS "------------+--------------------------------------------------------------")
message(STATUS "")
endif()
@@ -799,3 +871,4 @@ endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
message(STATUS "")
@@ -53,6 +53,8 @@
// this include file manages BLAS and MKL related macros
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
#include "src/Core/util/AOCL_Support.h" // ← ADD THIS

#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
@@ -463,6 +465,10 @@ using std::ptrdiff_t;
#include "src/Core/Assign_MKL.h"
#endif

#ifdef EIGEN_USE_AOCL_VML
#include "src/Core/Assign_AOCL.h"
#endif

#include "src/Core/GlobalFunctions.h"
// IWYU pragma: end_exports
Eigen/src/Core/Assign_AOCL.h (new file, 301 lines)
@@ -0,0 +1,301 @@
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
 *
 * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 *
 * Description:
 * ------------
 * This file implements a high-performance dispatch layer that automatically
 * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
 * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
 * performance.
 *
 * The dispatch system uses C++ template specialization to intercept Eigen's
 * assignment operations and redirect them to AOCL's VRDA functions, which
 * provide optimized implementations for AMD Zen architectures.
 *
 * Key Features:
 * -------------
 * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
 *    requiring code changes in user applications
 *
 * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
 *    family processors with automatic SIMD instruction selection (AVX2, AVX-512)
 *
 * 3. Threshold-Based Activation: Only activates for vectors larger than
 *    EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on
 *    small vectors
 *
 * 4. Precision-Specific Handling:
 *    - Double precision: AOCL VRDA vectorized functions
 *    - Single precision: Scalar fallback (preserves correctness)
 *
 * 5. Memory Layout Compatibility: Ensures direct memory access and compatible
 *    storage orders between source and destination for optimal performance
 *
 * Supported Operations:
 * ---------------------
 * UNARY OPERATIONS (vector → vector):
 * - Transcendental: exp(), sin(), cos(), sqrt(), log(), log10(), log2()
 *
 * BINARY OPERATIONS (vector op vector → vector):
 * - Arithmetic: +, *, pow()
 *
 * Template Specialization Mechanism:
 * -----------------------------------
 * The system works by specializing Eigen's Assignment template for:
 * 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
 * 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
 * 3. Dense2Dense assignment context with AOCL-compatible traits
 *
 * Dispatch conditions (all must be true):
 * - Source and destination have DirectAccessBit (contiguous memory)
 * - Compatible storage orders (both row-major or both column-major)
 * - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size
 * - Supported data type (currently double precision for VRDA)
 *
 * Integration Example:
 * --------------------
 * // Standard Eigen code - no changes required
 * VectorXd x = VectorXd::Random(10000);
 * VectorXd y = VectorXd::Random(10000);
 * VectorXd result;
 *
 * // These operations are automatically dispatched to AOCL:
 * result = x.array().exp();           // → amd_vrda_exp()
 * result = x.array().sin();           // → amd_vrda_sin()
 * result = x.array() + y.array();     // → amd_vrda_add()
 * result = x.array().pow(y.array());  // → amd_vrda_pow()
 *
 * Configuration:
 * --------------
 * Required preprocessor definitions:
 * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
 * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
 *
 * Compilation Requirements:
 * -------------------------
 * Include paths:
 * - AOCL headers: -I${AOCL_ROOT}/include
 * - Eigen headers: -I/path/to/eigen
 *
 * Link libraries:
 * - AOCL MathLib: -lamdlibm
 * - Standard math: -lm
 *
 * Compiler flags:
 * - Optimization: -O3 (required for inlining)
 * - Architecture: -march=znver5 or -march=native
 * - Vectorization: -mfma -mavx512f (if supported)
 *
 * Platform Support:
 * ------------------
 * - Primary: Linux x86_64 with AMD Zen family processors
 * - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
 * - AOCL Version: 4.0+ (with VRDA support)
 *
 * Error Handling:
 * ---------------
 * - Graceful fallback to scalar operations for unsupported configurations
 * - Compile-time detection of AOCL availability
 * - Runtime size and alignment validation with eigen_assert()
 *
 * Developer:
 * ----------
 * Name: Sharad Saurabh Bhaskar
 * Email: shbhaska@amd.com
 * Organization: Advanced Micro Devices, Inc.
 */


#ifndef EIGEN_ASSIGN_AOCL_H
#define EIGEN_ASSIGN_AOCL_H

namespace Eigen {
namespace internal {

// Traits for unary operations.
template <typename Dst, typename Src> class aocl_assign_traits {
 private:
  enum {
    DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
    SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
    InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
                : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
                                             : int(Dst::RowsAtCompileTime),
    LargeEnough =
        (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
  };

 public:
  enum {
    EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess &&
                    StorageOrdersAgree && LargeEnough,
    Traversal = LinearTraversal
  };
};

// Traits for binary operations (e.g., add, pow).
template <typename Dst, typename Lhs, typename Rhs>
class aocl_assign_binary_traits {
 private:
  enum {
    DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
    LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
    RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
                         (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
    InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
                : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
                                             : int(Dst::RowsAtCompileTime),
    LargeEnough =
        (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
  };

 public:
  enum {
    EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess &&
                    RhsHasDirectAccess && StorageOrdersAgree && LargeEnough
  };
};

// Unary operation dispatch for float (scalar fallback).
#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \
  template <typename DstXprType, typename SrcXprNested> \
  struct Assignment< \
      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>, \
      assign_op<float, float>, Dense2Dense, \
      std::enable_if_t< \
          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested> \
        SrcXprType; \
    static void run(DstXprType &dst, const SrcXprType &src, \
                    const assign_op<float, float> &) { \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
      Eigen::Index n = dst.size(); \
      if (n <= 0) \
        return; \
      const float *input = \
          reinterpret_cast<const float *>(src.nestedExpression().data()); \
      float *output = reinterpret_cast<float *>(dst.data()); \
      for (Eigen::Index i = 0; i < n; ++i) { \
        output[i] = std::EIGENOP(input[i]); \
      } \
    } \
  };

// Unary operation dispatch for double (AOCL vectorized).
#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
  template <typename DstXprType, typename SrcXprNested> \
  struct Assignment< \
      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>, \
      assign_op<double, double>, Dense2Dense, \
      std::enable_if_t< \
          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested> \
        SrcXprType; \
    static void run(DstXprType &dst, const SrcXprType &src, \
                    const assign_op<double, double> &) { \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
      Eigen::Index n = dst.size(); \
      eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
      if (n <= 0) \
        return; \
      const double *input = \
          reinterpret_cast<const double *>(src.nestedExpression().data()); \
      double *output = reinterpret_cast<double *>(dst.data()); \
      int aocl_n = internal::convert_index<int>(n); \
      AOCLOP(aocl_n, const_cast<double *>(input), output); \
    } \
  };

// Instantiate unary calls for float (scalar).
// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)

// Instantiate unary calls for double (AOCL vectorized).
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)

// Binary operation dispatch for float (scalar fallback).
#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \
  template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
  struct Assignment< \
      DstXprType, \
      CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
                    RhsXprNested>, \
      assign_op<float, float>, Dense2Dense, \
      std::enable_if_t<aocl_assign_binary_traits< \
          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
                          RhsXprNested> \
        SrcXprType; \
    static void run(DstXprType &dst, const SrcXprType &src, \
                    const assign_op<float, float> &) { \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
      Eigen::Index n = dst.size(); \
      if (n <= 0) \
        return; \
      const float *lhs = reinterpret_cast<const float *>(src.lhs().data()); \
      const float *rhs = reinterpret_cast<const float *>(src.rhs().data()); \
      float *output = reinterpret_cast<float *>(dst.data()); \
      for (Eigen::Index i = 0; i < n; ++i) { \
        output[i] = STDFUNC(lhs[i], rhs[i]); \
      } \
    } \
  };

// Binary operation dispatch for double (AOCL vectorized).
#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
  template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
  struct Assignment< \
      DstXprType, \
      CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
                    RhsXprNested>, \
      assign_op<double, double>, Dense2Dense, \
      std::enable_if_t<aocl_assign_binary_traits< \
          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
                          RhsXprNested> \
        SrcXprType; \
    static void run(DstXprType &dst, const SrcXprType &src, \
                    const assign_op<double, double> &) { \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
      Eigen::Index n = dst.size(); \
      eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
      if (n <= 0) \
        return; \
      const double *lhs = reinterpret_cast<const double *>(src.lhs().data()); \
      const double *rhs = reinterpret_cast<const double *>(src.rhs().data()); \
      double *output = reinterpret_cast<double *>(dst.data()); \
      int aocl_n = internal::convert_index<int>(n); \
      AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \
    } \
  };

// Instantiate binary calls for float (scalar).
// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>) // Using
// scalar_sum_op for addition EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)

// Instantiate binary calls for double (AOCL vectorized).
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add)  // Using scalar_sum_op for addition
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_ASSIGN_AOCL_H
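As a usage sketch (not part of the patch): the snippet below illustrates which assignments satisfy the dispatch conditions listed in the file header, assuming this patched Eigen is on the include path and the program is compiled with `-DEIGEN_USE_AOCL_ALL` and linked against `-lamdlibm -lm`.

```
#include <Eigen/Dense>

int main() {
  using Eigen::ArrayXd;

  // Dynamic-size, contiguous, double-precision expressions: the traits above
  // report EnableAoclVML, so these assignments are candidates for amd_vrda_*.
  ArrayXd x = ArrayXd::Random(1024);
  ArrayXd y = ArrayXd::Random(1024);
  ArrayXd r = x.exp();   // candidate for amd_vrda_exp
  r = x + y;             // candidate for amd_vrda_add (scalar_sum_op)
  r = x.pow(y);          // candidate for amd_vrda_pow

  // Fixed-size expressions below EIGEN_AOCL_VML_THRESHOLD (default 128) fail
  // the LargeEnough check and stay on Eigen's regular evaluator.
  Eigen::Array<double, 16, 1> s = Eigen::Array<double, 16, 1>::Random();
  Eigen::Array<double, 16, 1> rs = s.exp();

  return (r.allFinite() && rs.allFinite()) ? 0 : 1;
}
```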
@@ -32,6 +32,7 @@ EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
#ifdef EIGEN_VECTORIZE_AVX2
EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
EIGEN_DOUBLE_PACKET_FUNCTION(tan, Packet4d)
#endif
EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)

@@ -110,6 +110,7 @@ struct packet_traits<float> : default_packet_traits {
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -143,6 +144,7 @@ struct packet_traits<double> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_AVX2
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = 1,

@@ -119,6 +119,7 @@ struct packet_traits<float> : default_packet_traits {
HasConj = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -154,6 +155,7 @@ struct packet_traits<double> : default_packet_traits {
HasCbrt = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasLog1p = 1,

@@ -178,6 +178,7 @@ struct packet_traits<float> : default_packet_traits {
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -3098,6 +3099,7 @@ struct packet_traits<double> : default_packet_traits {
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,

@@ -4,6 +4,7 @@
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2018-2025 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -641,7 +642,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const P
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
// exp(r) is computed using a 6th order minimax polynomial approximation.
template <typename Packet>
template <typename Packet, bool IsFinite>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
const Packet cst_zero = pset1<Packet>(0.0f);
const Packet cst_one = pset1<Packet>(1.0f);
@@ -656,10 +657,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);

// Clamp x.
Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
Packet x = pmin(_x, cst_exp_hi);
Packet zero_mask;
Packet x;
if (!IsFinite) {
// Clamp x.
zero_mask = pcmp_lt(_x, cst_exp_lo);
x = pmin(_x, cst_exp_hi);
} else {
x = _x;
}

// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
@@ -682,7 +688,9 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Pack
const Packet p_low = padd(r, cst_one);
Packet y = pmadd(r, p_odd, p_even);
y = pmadd(r2, y, p_low);

if (IsFinite) {
return pldexp_fast(y, m);
}
// Return 2^m * exp(r).
const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
if (!predux_any(fast_pldexp_unsafe)) {
@@ -765,6 +773,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Pac
return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}

// Enum for selecting which function to compute. SinCos is intended to compute
// pairs of Sin and Cos of the even entries in the packet, e.g.
// SinCos([a, *, b, *]) = [sin(a), cos(a), sin(b), cos(b)].
enum class TrigFunction : uint8_t { Sin, Cos, Tan, SinCos };

// The following code is inspired by the following stack-overflow answer:
// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
// It has been largely optimized:
@@ -821,7 +834,7 @@ inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
return float(double(int64_t(p)) * pio2_62);
}

template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -851,7 +864,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if defined(EIGEN_VECTORIZE_FMA)
// This version requires true FMA for high accuracy.
// It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
constexpr float huge_th = (Func == TrigFunction::Sin) ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
@@ -862,7 +875,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS

// The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
// and 2 ULP up to:
const float huge_th = ComputeSine ? 25966.f : 18838.f;
constexpr float huge_th = (Func == TrigFunction::Sin) ? 25966.f : 18838.f;
x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
EIGEN_OPTIMIZATION_BARRIER(x)
x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
@@ -900,13 +913,6 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
y_int = ploadu<PacketI>(y_int2);
}

// Compute the sign to apply to the polynomial.
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit

// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
@@ -935,7 +941,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
y2 = pmadd(y2, x, x);

// Select the correct result from the two polynomials.
if (ComputeBoth) {
// Compute the sign to apply to the polynomial.
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = (Func == TrigFunction::Sin) ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
: preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit

if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
// TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
Packet peven = peven_mask(x);
Packet ysin = pselect(poly_mask, y2, y1);
Packet ycos = pselect(poly_mask, y1, y2);
@@ -943,23 +957,28 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
sign_bit_sin = pand(sign_bit_sin, cst_sign_mask); // clear all but left most bit
sign_bit_cos = pand(sign_bit_cos, cst_sign_mask); // clear all but left most bit
y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
y = (Func == TrigFunction::SinCos) ? pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos))
: pdiv(pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
} else {
y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = (Func == TrigFunction::Sin) ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
y = pxor(y, sign_bit);
}
// Update the sign and filter huge inputs
return y;
}

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
return psincos_float<true>(x);
return psincos_float<TrigFunction::Sin>(x);
}

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
return psincos_float<false>(x);
return psincos_float<TrigFunction::Cos>(x);
}

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x) {
return psincos_float<TrigFunction::Tan>(x);
}

// Trigonometric argument reduction for double for inputs smaller than 15.
@@ -999,7 +1018,7 @@ Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Pa
return t;
}

template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
template <TrigFunction Func, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
__attribute__((optimize("-fno-unsafe-math-optimizations")))
@@ -1086,19 +1105,26 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
Packet sign_bit, sFinalRes;
if (ComputeBoth) {
if (Func == TrigFunction::Sin) {
sign_bit = sign_sin;
sFinalRes = pselect(poly_mask, ssin, scos);
} else if (Func == TrigFunction::Cos) {
sign_bit = sign_cos;
sFinalRes = pselect(poly_mask, scos, ssin);
} else if (Func == TrigFunction::Tan) {
// TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
sign_bit = pxor(sign_sin, sign_cos);
sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
} else if (Func == TrigFunction::SinCos) {
Packet peven = peven_mask(x);
sign_bit = pselect((s), sign_sin, sign_cos);
sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
} else {
sign_bit = ComputeSine ? sign_sin : sign_cos;
sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
}
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
sFinalRes = pxor(sFinalRes, sign_bit);

// If the inputs values are higher than that a value that the argument reduction can currently address, compute them
// using std::sin and std::cos
// using the C++ standard library.
// TODO Remove it when huge angle argument reduction is implemented
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
const int PacketSize = unpacket_traits<Packet>::size;
@@ -1109,10 +1135,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
for (int k = 0; k < PacketSize; ++k) {
double val = x_cpy[k];
if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
if (ComputeBoth)
if (Func == TrigFunction::Sin) {
sincos_vals[k] = std::sin(val);
} else if (Func == TrigFunction::Cos) {
sincos_vals[k] = std::cos(val);
} else if (Func == TrigFunction::Tan) {
sincos_vals[k] = std::tan(val);
} else if (Func == TrigFunction::SinCos) {
sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
else
sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
}
}
}
sFinalRes = ploadu<Packet>(sincos_vals);
@@ -1122,26 +1153,31 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
return psincos_double<true>(x);
return psincos_double<TrigFunction::Sin>(x);
}

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
return psincos_double<false>(x);
return psincos_double<TrigFunction::Cos>(x);
}

template <bool ComputeSin, typename Packet>
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x) {
return psincos_double<TrigFunction::Tan>(x);
}

template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
psincos_selector(const Packet& x) {
return psincos_float<ComputeSin, Packet, true>(x);
return psincos_float<TrigFunction::SinCos, Packet>(x);
}

template <bool ComputeSin, typename Packet>
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
psincos_selector(const Packet& x) {
return psincos_double<ComputeSin, Packet, true>(x);
return psincos_double<TrigFunction::SinCos, Packet>(x);
}

// Generic implementation of acos(x).
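A brief sketch (hypothetical caller, not part of the patch) of how the new `TrigFunction` template parameter replaces the old `ComputeSine`/`ComputeBoth` boolean flags at the call sites updated above; `Packet` stands for any float packet type such as `Packet4f` or `Packet8f`.

```
#include <Eigen/Core>

// Illustrative only: mirrors the updated psin_float/pcos_float/ptan_float and
// psincos_selector call sites above.
template <typename Packet>
Packet trig_refactor_sketch(const Packet& x) {
  using Eigen::internal::TrigFunction;
  using Eigen::internal::psincos_float;
  using Eigen::internal::padd;
  Packet s  = psincos_float<TrigFunction::Sin>(x);     // previously psincos_float<true>(x)
  Packet c  = psincos_float<TrigFunction::Cos>(x);     // previously psincos_float<false>(x)
  Packet t  = psincos_float<TrigFunction::Tan>(x);     // new in this patch
  Packet sc = psincos_float<TrigFunction::SinCos>(x);  // previously the ComputeBoth = true path
  return padd(padd(s, c), padd(t, sc));                // combine results so none is optimized away
}
```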
@@ -1292,6 +1328,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Pa
return pxor(result, x_signmask);
}

#ifdef EIGEN_FAST_MATH

/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
Doesn't do anything fancy, just a 9/8-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-8, 8],
@@ -1343,6 +1381,55 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x)
return pdiv(p, q);
}

#else

/** \internal \returns the hyperbolic tan of \a a (coeff-wise).
On the domain [-1.25:1.25] we use an approximation of the form
tanh(x) ~= x^3 * (P(x) / Q(x)) + x, where P and Q are polynomials in x^2.
For |x| > 1.25, tanh is implemented as tanh(x) = 1 - (2 / (1 + exp(2*x))).

This implementation has a maximum error of 1 ULP (measured with AVX2+FMA).

This implementation works on both scalars and packets.
*/
template <typename T>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& x) {
// The polynomial coefficients were computed using Rminimax:
// % ./ratapprox --function="tanh(x)-x" --dom='[-1.25,1.25]' --num="[x^3,x^5]" --den="even"
// --type="[3,4]" --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec" --output=tanhf.solly
constexpr float alpha[] = {-1.46725140511989593505859375e-02f, -3.333333432674407958984375e-01f};
constexpr float beta[] = {1.570280082523822784423828125e-02, 4.4401752948760986328125e-01, 1.0f};
const T x2 = pmul(x, x);
const T x3 = pmul(x2, x);
const T p = ppolevl<T, 1>::run(x2, alpha);
const T q = ppolevl<T, 2>::run(x2, beta);
const T small_tanh = pmadd(x3, pdiv(p, q), x);

const T sign_mask = pset1<T>(-0.0f);
const T abs_x = pandnot(x, sign_mask);
constexpr float kSmallThreshold = 1.25f;
const T large_mask = pcmp_lt(pset1<T>(kSmallThreshold), abs_x);
// Fast exit if all elements are small.
if (!predux_any(large_mask)) {
return small_tanh;
}

// Compute as 1 - (2 / (1 + exp(2*x)))
const T one = pset1<T>(1.0f);
const T two = pset1<T>(2.0f);
const T s = pexp_float<T, true>(pmul(two, abs_x));
const T abs_tanh = psub(one, pdiv(two, padd(s, one)));

// Handle infinite inputs and set sign bit.
constexpr float kHugeThreshold = 16.0f;
const T huge_mask = pcmp_lt(pset1<T>(kHugeThreshold), abs_x);
const T x_sign = pand(sign_mask, x);
const T large_tanh = por(x_sign, pselect(huge_mask, one, abs_tanh));
return pselect(large_mask, large_tanh, small_tanh);
}

#endif // EIGEN_FAST_MATH
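For reference, the large-argument branch of the new `ptanh_float` above relies on the identity

$$\tanh(x) \;=\; \frac{e^{2x} - 1}{e^{2x} + 1} \;=\; 1 - \frac{2}{1 + e^{2x}},$$

evaluated on $|x|$ with the sign restored from $x$ afterwards (the `abs_tanh`, `x_sign`, and `large_tanh` lines).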

/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
This uses a 19/18-degree rational interpolant which
is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
@@ -1540,7 +1627,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Pa
// cis(y):
RealPacket y = pand(odd_mask, a.v);
y = por(y, pcplxflip(Packet(y)).v);
RealPacket cisy = psincos_selector<false, RealPacket>(y);
RealPacket cisy = psincos_selector<RealPacket>(y);
cisy = pcplxflip(Packet(cisy)).v; // cos(y) + i * sin(y)

const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());

@@ -95,7 +95,7 @@ template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);

/** \internal \returns exp(x) for single precision float */
template <typename Packet>
template <typename Packet, bool IsFinite = false>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);

/** \internal \returns exp(x) for double precision real numbers */
@@ -110,6 +110,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Pack
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);

/** \internal \returns tan(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_float(const Packet& x);

/** \internal \returns sin(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
@@ -118,6 +122,10 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Pac
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);

/** \internal \returns tan(x) for double precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan_double(const Packet& x);

/** \internal \returns asin(x) for single precision float */
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
@@ -200,6 +208,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tan, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET) \
EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET) \
@@ -216,6 +225,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(tan, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET) \

@@ -197,6 +197,7 @@ struct packet_traits<float> : default_packet_traits {
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -5017,6 +5018,7 @@ struct packet_traits<double> : default_packet_traits {
#endif
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasSqrt = 1,
HasRsqrt = 1,
HasCbrt = 1,

@@ -507,6 +507,7 @@ struct packet_traits<float> : default_packet_traits {

HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,

@@ -183,6 +183,7 @@ struct packet_traits<float> : default_packet_traits {
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasACos = 1,
HasASin = 1,
HasATan = 1,
@@ -216,6 +217,7 @@ struct packet_traits<double> : default_packet_traits {
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasErfc = EIGEN_FAST_MATH,

@@ -36,6 +36,11 @@ EIGEN_STRONG_INLINE PacketXf pcos<PacketXf>(const PacketXf& x) {
return pcos_float(x);
}

template <>
EIGEN_STRONG_INLINE PacketXf ptan<PacketXf>(const PacketXf& x) {
return ptan_float(x);
}

// Hyperbolic Tangent function.
template <>
EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {

@@ -353,6 +353,7 @@ struct packet_traits<float> : default_packet_traits {
HasCmp = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTan = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasPow = 1,

@@ -56,6 +56,7 @@ struct generic_float_packet_traits : default_packet_traits {
HasReciprocal = 1,
HasSin = 1,
HasCos = 1,
HasTan = 1,
HasACos = 1,
HasASin = 1,
HasATan = 1,
Eigen/src/Core/util/AOCL_Support.h (new file, 175 lines)
@@ -0,0 +1,175 @@
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
 *
 * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 *
 * Description:
 * ------------
 * This header file serves as the central configuration and integration point
 * for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
 * It orchestrates the integration of multiple AOCL components to provide
 * optimal mathematical computing performance on AMD Zen family processors.
 *
 * AOCL Component Integration:
 * ---------------------------
 * 1. AOCL Vector Math Library (VML):
 *    - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
 *    - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
 *    - SIMD vectorization for AMD architectures (AVX2, AVX-512)
 *    - Headers: amdlibm.h, amdlibm_vec.h
 *
 * 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
 *    - High-performance Basic Linear Algebra Subprograms
 *    - Supports single-threaded (libblis) and multithreaded (libblis-mt)
 *      variants
 *    - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
 *    - Headers: cblas.h, blis.h
 *
 * 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
 *    - Dense linear algebra operations: factorizations, eigenvalue solvers
 *    - Matrix decompositions: LU, Cholesky, QR, SVD
 *    - Eigenvalue/eigenvector computations optimized for AMD hardware
 *    - Headers: LAPACKE interface
 *
 * ------------------------------
 * EIGEN_AOCL_VML_THRESHOLD (default: 128):
 *    - Minimum vector size for AOCL VML dispatch
 *    - Smaller vectors use standard Eigen to avoid function call overhead
 *    - Optimal values: 64-512 depending on operation and data characteristics
 *
 *
 *
 * Architecture Support:
 * ---------------------
 * Optimized for AMD processor families:
 * - Zen Architecture (Naples, Rome): AVX2 optimization
 * - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
 * - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
 * - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
 * - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
 *
 *
 * Dependencies:
 * -------------
 * Required AOCL components:
 * - libamdlibm: Core math library with VRDA functions
 * - libblis or libblis-mt: BLAS implementation
 * - libflame: LAPACK implementation
 *
 * System requirements:
 * - AMD x86_64 processor (optimal performance)
 * - Linux, Windows, or compatible POSIX system
 * - C++11 or later standard
 * - CMake 3.5+ for build system integration
 *
 * Developer:
 * ----------
 * Name: Sharad Saurabh Bhaskar
 * Email: shbhaska@amd.com
 * Organization: Advanced Micro Devices, Inc.
 */

#ifndef EIGEN_AOCL_SUPPORT_H
#define EIGEN_AOCL_SUPPORT_H

#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)

#include <complex>

// Define AOCL component flags based on main flags
#ifdef EIGEN_USE_AOCL_ALL
#define EIGEN_USE_AOCL_VML   // Enable AOCL Vector Math Library
#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)

// Enable Eigen BLAS backend only if BLIS provides compatible interface
#if defined(EIGEN_AOCL_BLIS_COMPATIBLE)
#define EIGEN_USE_BLAS  // Enable Eigen BLAS backend
#endif

#define EIGEN_USE_LAPACKE  // Enable LAPACK backend (FLAME)
#endif

#ifdef EIGEN_USE_AOCL_MT
#define EIGEN_USE_AOCL_VML   // Enable AOCL Vector Math Library
#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)

// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts
// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS
// #define EIGEN_USE_BLAS // Commented out - causes conflicts with BLIS
// interface

// Note: LAPACKE disabled in MT mode to avoid header conflicts
#define EIGEN_USE_LAPACKE  // Commented out - causes conflicts with BLIS LAPACKE
#define EIGEN_AOCL_USE_BLIS_MT 1  // Enable multithreaded BLIS
#endif

// Handle standalone EIGEN_USE_AOCL_VML flag
#ifndef EIGEN_USE_AOCL_VML
#ifdef EIGEN_USE_AOCL_ALL
#define EIGEN_USE_AOCL_VML
#endif
#ifdef EIGEN_USE_AOCL_MT
#define EIGEN_USE_AOCL_VML
#endif
#endif

// Configuration constants - define these for any AOCL usage
#ifndef EIGEN_AOCL_VML_THRESHOLD
#define EIGEN_AOCL_VML_THRESHOLD 128  // Threshold for VML dispatch
#endif

#ifndef AOCL_SIMD_WIDTH
#define AOCL_SIMD_WIDTH 8  // AVX-512: 512 bits / 64 bits per double
#endif

// Include AOCL Math Library headers for VML
#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || \
    defined(EIGEN_USE_AOCL_MT)
#if defined(__has_include)
#if __has_include("amdlibm.h")
#include "amdlibm.h"
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
#define AMD_LIBM_VEC_EXPERIMENTAL
#endif
#if __has_include("amdlibm_vec.h")
#include "amdlibm_vec.h"
#endif
#endif
#else
// Fallback for compilers without __has_include
#include "amdlibm.h"
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
#define AMD_LIBM_VEC_EXPERIMENTAL
#endif
#include "amdlibm_vec.h"
#endif
#endif

// Include CBLAS headers when BLAS is enabled
#ifdef EIGEN_USE_AOCL_BLAS
#if defined(__has_include)
#if __has_include("cblas.h")
#include "cblas.h"
#elif __has_include("blis.h")
#include "blis.h"
#endif
#else
// Fallback
#include "cblas.h"
#endif
#endif

namespace Eigen {
// AOCL-specific type definitions
typedef std::complex<double> dcomplex;
typedef std::complex<float> scomplex;
typedef int BlasIndex;  // Standard BLAS index type
}  // namespace Eigen

#endif  // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT

#endif  // EIGEN_AOCL_SUPPORT_H
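A minimal sketch (not part of the patch) of how a translation unit might opt in to this support header and tune the dispatch threshold, assuming the AOCL headers and libraries (amdlibm, BLIS, libFLAME) are installed and on the include and link paths:

```
// Hypothetical client file; the macro names come from AOCL_Support.h above.
#define EIGEN_USE_AOCL_ALL            // enables EIGEN_USE_AOCL_VML, _BLAS, _LAPACKE
#define EIGEN_AOCL_VML_THRESHOLD 256  // raise the dispatch cutoff from the default 128
#include <Eigen/Dense>

int main() {
  Eigen::ArrayXd x = Eigen::ArrayXd::Random(4096);
  Eigen::ArrayXd y = x.exp();  // large and contiguous: routed through AOCL VML when enabled
  return y.allFinite() ? 0 : 1;
}
```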
@@ -53,3 +53,56 @@ $ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp



************************
* benchmark_aocl *
************************

This benchmark exercises Eigen operations using AMD Optimized Libraries
(AOCL). It is disabled by default and can be enabled when configuring the
build:

cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON

The resulting `benchmark_aocl` target is compiled with `-O3` and, if the
compiler supports it, `-march=znver5` for optimal performance on AMD
processors.

The benchmark also links to `libblis-mt.so` and `libflame.so` so BLAS and
LAPACK operations run with multithreaded AOCL when available.

By default the CMake build defines `EIGEN_USE_AOCL_MT` via the option
`EIGEN_AOCL_BENCH_USE_MT` (enabled). Set this option to `OFF` if you want
to build the benchmark using the single-threaded AOCL libraries instead,
in which case `EIGEN_USE_AOCL_ALL` is defined.


Alternatively you can build the same benchmark using the
`Makefile` in this directory. This allows experimenting with
different compiler flags without reconfiguring CMake:

```
cd bench && make                              # builds with -O3 -march=znver5 by default
make clean && make CXX="clang++"              # for a different compiler than g++
make clean && make MARCH="" CXXFLAGS="-O2"    # example of custom flags
make AOCL_ROOT=/opt/aocl                      # use AOCL from a custom location
```

This Makefile links against `libblis-mt.so` and `libflame.so` so the
matrix multiplication benchmark exercises multithreaded BLIS when
`EIGEN_USE_AOCL_MT` is defined (enabled by default in the Makefile).

If you prefer to compile manually, ensure that the Eigen include path
points to the directory where `AOCL_Support.h` resides. For example:

```
clang++ -O3 -std=c++14 -I../build/install/include \
    -march=znver5 -DEIGEN_USE_AOCL_MT \
    benchmark_aocl.cpp -o benchmark_aocl \
    -lblis-mt -lflame -lamdlibm -lpthread -lm
```

Replace `../install/include` with your actual Eigen install path.

When invoking `make`, you can point `AOCL_ROOT` to your AOCL
installation directory so the Makefile links against `$(AOCL_ROOT)/lib`.
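A minimal translation unit (hypothetical, not part of the patch) that can be compiled the same way as the command above, substituting the file name, to smoke-test that the AOCL-enabled Eigen headers are being picked up:

```
// smoke_test.cpp -- EIGEN_USE_AOCL_MT is supplied on the command line via -D.
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::ArrayXd v = Eigen::ArrayXd::LinSpaced(1 << 20, 0.1, 10.0);
  Eigen::ArrayXd r = v.exp();  // large, contiguous: eligible for the AOCL VML dispatch
  std::cout << "r[0] = " << r[0] << ", r.sum() = " << r.sum() << std::endl;
  return 0;
}
```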

bench/benchmark_aocl.cpp (new file, 362 lines)
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen
|
||||
*
|
||||
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Description:
|
||||
* ------------
|
||||
* This benchmark suite evaluates the performance of Eigen mathematical
|
||||
* operations when integrated with AMD Optimizing CPU Libraries (AOCL). It
|
||||
* tests:
|
||||
*
|
||||
* 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt,
|
||||
* log, etc.) using AOCL Vector Math Library (VML) for optimized
|
||||
* double-precision operations
|
||||
*
|
||||
* 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library
|
||||
* with support for both single-threaded and multithreaded execution
|
||||
*
|
||||
* 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using
|
||||
* libflame
|
||||
*
|
||||
* 4. Real-world Scenarios: Financial risk computation simulating covariance
|
||||
* matrix calculations and eigenvalue analysis for portfolio optimization
|
||||
*
|
||||
* The benchmark automatically detects AOCL configuration and adjusts test
|
||||
* execution accordingly, providing performance comparisons between standard
|
||||
* Eigen operations and AOCL-accelerated implementations.
|
||||
*
|
||||
* Compilation:
|
||||
* ------------
|
||||
* # Using AOCC compiler (recommended for best AOCL compatibility):
|
||||
 * clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
|
||||
* -I${AOCL_ROOT}/include \
|
||||
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
||||
* -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \
|
||||
* -o build/eigen_aocl_benchmark
|
||||
*
|
||||
* # Alternative: Using GCC with proper library paths:
|
||||
 * g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
|
||||
* -I${AOCL_ROOT}/include \
|
||||
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
||||
* -lamdlibm -lm -lblis -lflame -lpthread -lrt \
|
||||
* -o build/eigen_aocl_benchmark
|
||||
*
|
||||
* # For multithreaded BLIS support:
|
||||
* clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \
|
||||
* -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \
|
||||
* -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \
|
||||
* -o build/eigen_aocl_benchmark_mt
|
||||
*
|
||||
* Usage:
|
||||
* ------
|
||||
* export AOCL_ROOT=/path/to/aocl/installation
|
||||
* export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH
|
||||
* ./build/eigen_aocl_benchmark
|
||||
*
|
||||
* Developer:
|
||||
* ----------
|
||||
* Name: Sharad Saurabh Bhaskar
|
||||
* Email: shbhaska@amd.com
|
||||
* Organization: Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>  // std::string / std::to_string, used when setting BLIS_NUM_THREADS
#include <thread>
#include <vector>
|
||||
|
||||
// Simple - just include Eigen headers
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/Dense>
|
||||
#include <Eigen/Eigenvalues>
|
||||
|
||||
// Only include the CBLAS interface when an AOCL BLIS build is requested,
// either the single-threaded configuration or the multithreaded one.
#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_AOCL_USE_BLIS_MT)
#include <cblas.h>
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace std::chrono;
|
||||
using namespace Eigen;
|
||||
|
||||
void benchmarkVectorMath(int size) {
|
||||
VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0);
|
||||
VectorXd result(size);
|
||||
double elapsed_ms = 0;
|
||||
|
||||
cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl;
|
||||
|
||||
auto start = high_resolution_clock::now();
|
||||
result = v.array().exp();
|
||||
auto end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "exp() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sin();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sin() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cos();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cos() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sqrt();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sqrt() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cbrt();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cbrt() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().abs();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "abs() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().log();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "log() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().log10();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "log10() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().exp2();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "exp2() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().asin();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "asin() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sinh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sinh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().acos();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "acos() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cosh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cosh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().tan();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "tan() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().atan();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "atan() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().tanh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "tanh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
VectorXd v2 = VectorXd::Random(size);
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array() + v2.array();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "add() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().pow(2.0);
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "pow() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().max(v2.array());
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "max() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().min(v2.array());
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "min() time: " << elapsed_ms << " ms" << endl;
|
||||
}
|
||||
|
||||
// Function to benchmark BLAS operation: Matrix multiplication.
|
||||
void benchmarkMatrixMultiplication(int matSize) {
|
||||
cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize
|
||||
<< ") ---" << endl;
|
||||
|
||||
MatrixXd A = MatrixXd::Random(matSize, matSize);
|
||||
MatrixXd B = MatrixXd::Random(matSize, matSize);
|
||||
MatrixXd C(matSize, matSize);
|
||||
|
||||
auto start = high_resolution_clock::now();
|
||||
C = A * B;
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl;
|
||||
}
|
||||
|
||||
// Benchmark BLIS directly using its CBLAS interface if available.
|
||||
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
|
||||
#if defined(EIGEN_AOCL_USE_BLIS_MT)
|
||||
cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
|
||||
<< ", threads=" << numThreads << ") ---" << endl;
|
||||
vector<double> A(matSize * matSize);
|
||||
vector<double> B(matSize * matSize);
|
||||
vector<double> C(matSize * matSize);
|
||||
for (auto &v : A)
|
||||
v = static_cast<double>(rand()) / RAND_MAX;
|
||||
for (auto &v : B)
|
||||
v = static_cast<double>(rand()) / RAND_MAX;
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
string th = to_string(numThreads);
|
||||
setenv("BLIS_NUM_THREADS", th.c_str(), 1);
|
||||
auto start = high_resolution_clock::now();
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
|
||||
matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
|
||||
C.data(), matSize);
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl;
|
||||
#else
|
||||
(void)matSize;
|
||||
(void)numThreads;
|
||||
cout << "\nBLIS multithreaded support not enabled." << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Function to benchmark LAPACK operation: Eigenvalue decomposition.
|
||||
void benchmarkEigenDecomposition(int matSize) {
|
||||
cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize
|
||||
<< " x " << matSize << ") ---" << endl;
|
||||
MatrixXd M = MatrixXd::Random(matSize, matSize);
|
||||
// Make matrix symmetric (necessary for eigenvalue decomposition of
|
||||
// self-adjoint matrices)
|
||||
M = (M + M.transpose()) * 0.5;
|
||||
|
||||
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
||||
auto start = high_resolution_clock::now();
|
||||
eigensolver.compute(M);
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
if (eigensolver.info() == Success) {
|
||||
cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl;
|
||||
} else {
|
||||
cout << "Eigenvalue decomposition failed." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Function simulating a real-world FSI risk computation scenario.
|
||||
// Example: Compute covariance matrix from simulated asset returns, then perform
|
||||
// eigenvalue decomposition.
|
||||
void benchmarkFSIRiskComputation(int numPeriods, int numAssets) {
|
||||
cout << "\n--- FSI Risk Computation Benchmark ---" << endl;
|
||||
cout << "Simulating " << numPeriods << " periods for " << numAssets
|
||||
<< " assets." << endl;
|
||||
|
||||
// Simulate asset returns: each column represents an asset's returns.
|
||||
MatrixXd returns = MatrixXd::Random(numPeriods, numAssets);
|
||||
|
||||
// Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1)
|
||||
auto start = high_resolution_clock::now();
|
||||
MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1);
|
||||
auto end = high_resolution_clock::now();
|
||||
double cov_time = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "Covariance matrix computation time: " << cov_time << " ms" << endl;
|
||||
|
||||
// Eigenvalue decomposition on covariance matrix.
|
||||
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
||||
start = high_resolution_clock::now();
|
||||
eigensolver.compute(cov);
|
||||
end = high_resolution_clock::now();
|
||||
double eig_time = duration_cast<milliseconds>(end - start).count();
|
||||
if (eigensolver.info() == Success) {
|
||||
cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms"
|
||||
<< endl;
|
||||
cout << "Top 3 Eigenvalues: "
|
||||
<< eigensolver.eigenvalues().tail(3).transpose() << endl;
|
||||
} else {
|
||||
cout << "Eigenvalue decomposition failed." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl;
|
||||
cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl;
|
||||
cout << "Organization: Advanced Micro Devices, Inc." << endl;
|
||||
cout << "License: Mozilla Public License 2.0" << endl << endl;
|
||||
|
||||
// Print AOCL configuration
|
||||
#ifdef EIGEN_USE_AOCL_MT
|
||||
cout << "AOCL Mode: MULTITHREADED (MT)" << endl;
|
||||
cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl;
|
||||
#elif defined(EIGEN_USE_AOCL_ALL)
|
||||
cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl;
|
||||
cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl;
|
||||
#else
|
||||
cout << "AOCL Mode: DISABLED" << endl;
|
||||
cout << "Using standard Eigen implementation" << endl;
|
||||
#endif
|
||||
cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl;
|
||||
|
||||
// Benchmark vector math functions with varying vector sizes.
|
||||
vector<int> vectorSizes = {5000000, 10000000, 50000000};
|
||||
for (int size : vectorSizes) {
|
||||
benchmarkVectorMath(size);
|
||||
}
|
||||
|
||||
// Benchmark matrix multiplication for varying sizes.
|
||||
vector<int> matrixSizes = {1024};
|
||||
for (int msize : matrixSizes) {
|
||||
benchmarkMatrixMultiplication(msize);
|
||||
#if defined(EIGEN_AOCL_USE_BLIS_MT)
|
||||
benchmarkBlisMultithreaded(msize, thread::hardware_concurrency());
|
||||
#endif
|
||||
}
|
||||
|
||||
// Benchmark LAPACK: Eigenvalue Decomposition.
|
||||
for (int msize : matrixSizes) {
|
||||
benchmarkEigenDecomposition(msize);
|
||||
}
|
||||
|
||||
// Benchmark a complex FSI risk computation scenario.
|
||||
// For example, simulate 10,000 time periods (days) for 500 assets.
|
||||
benchmarkFSIRiskComputation(10000, 500);
|
||||
|
||||
cout << "\n=== Benchmark Complete ===" << endl;
|
||||
return 0;
|
||||
}
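
For orientation only, the following is a minimal, hypothetical CMake sketch of how a standalone project might compile this benchmark against AOCL via the `FindAOCL.cmake` module added in this change. The project/target names and the `EIGEN3_INCLUDE_DIR` placeholder are illustrative; this is not Eigen's actual build wiring:

```
# Hypothetical standalone build sketch -- not Eigen's actual CMake wiring.
cmake_minimum_required(VERSION 3.10)
project(benchmark_aocl_standalone CXX)

# Assume FindAOCL.cmake has been copied into ./cmake/ next to this file.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(EIGEN_AOCL_BENCH_USE_MT ON)       # ask the module for the multithreaded BLIS
find_package(AOCL REQUIRED)           # sets AOCL_LIBRARIES and AOCL_INCLUDE_DIRS

add_executable(benchmark_aocl benchmark_aocl.cpp)
# EIGEN3_INCLUDE_DIR is a placeholder for wherever the Eigen headers are installed.
target_include_directories(benchmark_aocl PRIVATE ${AOCL_INCLUDE_DIRS} ${EIGEN3_INCLUDE_DIR})
target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_MT)
target_compile_options(benchmark_aocl PRIVATE -O3 -march=znver5)
target_link_libraries(benchmark_aocl PRIVATE ${AOCL_LIBRARIES})
```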
|
||||
@@ -21,7 +21,9 @@ list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static)
|
||||
if (EIGEN_BUILD_SHARED_LIBS)
|
||||
add_library(eigen_blas SHARED ${EigenBlas_SRCS} "eigen_blas.def")
|
||||
target_compile_definitions(eigen_blas PUBLIC "EIGEN_BLAS_BUILD_DLL")
|
||||
set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
set_target_properties(eigen_blas PROPERTIES CXX_VISIBILITY_PRESET hidden
|
||||
VERSION ${EIGEN_VERSION_NUMBER}
|
||||
SOVERSION ${EIGEN_MAJOR_VERSION})
|
||||
list(APPEND EIGEN_BLAS_TARGETS eigen_blas)
|
||||
endif()
|
||||
|
||||
|
||||
13 blas/blas.h
@@ -59,6 +59,19 @@ EIGEN_BLAS_API void BLASFUNC(caxpyc)(const int *, const float *, const float *,
|
||||
EIGEN_BLAS_API void BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
|
||||
|
||||
EIGEN_BLAS_API void BLASFUNC(saxpby)(const int *, const float *, const float *, const int *, const float *, float *,
|
||||
const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(daxpby)(const int *, const double *, const double *, const int *, const double *, double *,
|
||||
const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(qaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
|
||||
const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(caxpby)(const int *, const float *, const float *, const int *, const float *, float *,
|
||||
const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(zaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
|
||||
const int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(xaxpby)(const int *, const double *, const double *, const int *, const double *, double *,
|
||||
const int *);
|
||||
|
||||
EIGEN_BLAS_API void BLASFUNC(scopy)(int *, float *, int *, float *, int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(dcopy)(int *, double *, int *, double *, int *);
|
||||
EIGEN_BLAS_API void BLASFUNC(qcopy)(int *, double *, int *, double *, int *);
|
||||
|
||||
@@ -13,6 +13,10 @@ EXPORTS
|
||||
zaxpy_
|
||||
; caxpyc_
|
||||
; zaxpyc_
|
||||
saxpby_
daxpby_
caxpby_
zaxpby_
|
||||
scopy_
|
||||
dcopy_
|
||||
ccopy_
|
||||
@@ -91,8 +95,8 @@ EXPORTS
|
||||
; dmin_
|
||||
; cmin_
|
||||
; zmin_
|
||||
|
||||
|
||||
|
||||
|
||||
; Level 2
|
||||
sgemv_
|
||||
dgemv_
|
||||
|
||||
@@ -22,11 +22,42 @@ EIGEN_BLAS_FUNC(axpy)
|
||||
else if (*incx > 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, *incx);
|
||||
else if (*incx > 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, *incx);
|
||||
make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, *incx).reverse();
|
||||
else if (*incx < 0 && *incy > 0)
|
||||
make_vector(y, *n, *incy) += alpha * make_vector(x, *n, -*incx).reverse();
|
||||
else if (*incx < 0 && *incy < 0)
|
||||
make_vector(y, *n, -*incy).reverse() += alpha * make_vector(x, *n, -*incx).reverse();
|
||||
make_vector(y, *n, -*incy) += alpha * make_vector(x, *n, -*incx);
|
||||
}
|
||||
|
||||
EIGEN_BLAS_FUNC(axpby)
|
||||
(const int *pn, const RealScalar *palpha, const RealScalar *px, const int *pincx, const RealScalar *pbeta,
|
||||
RealScalar *py, const int *pincy) {
|
||||
const Scalar *x = reinterpret_cast<const Scalar *>(px);
|
||||
Scalar *y = reinterpret_cast<Scalar *>(py);
|
||||
const Scalar alpha = *reinterpret_cast<const Scalar *>(palpha);
|
||||
const Scalar beta = *reinterpret_cast<const Scalar *>(pbeta);
|
||||
const int n = *pn;
|
||||
|
||||
if (n <= 0) return;
|
||||
|
||||
if (Eigen::numext::equal_strict(beta, Scalar(1))) {
|
||||
EIGEN_BLAS_FUNC_NAME(axpy)(pn, palpha, px, pincx, py, pincy);
|
||||
return;
|
||||
}
|
||||
|
||||
const int incx = *pincx;
|
||||
const int incy = *pincy;
|
||||
|
||||
if (incx == 1 && incy == 1)
|
||||
make_vector(y, n) = alpha * make_vector(x, n) + beta * make_vector(y, n);
|
||||
else if (incx > 0 && incy > 0)
|
||||
make_vector(y, n, incy) = alpha * make_vector(x, n, incx) + beta * make_vector(y, n, incy);
|
||||
else if (incx > 0 && incy < 0)
|
||||
make_vector(y, n, -incy) = alpha * make_vector(x, n, incx).reverse() + beta * make_vector(y, n, -incy);
|
||||
else if (incx < 0 && incy > 0)
|
||||
make_vector(y, n, incy) = alpha * make_vector(x, n, -incx).reverse() + beta * make_vector(y, n, incy);
|
||||
else if (incx < 0 && incy < 0)
|
||||
make_vector(y, n, -incy) = alpha * make_vector(x, n, -incx) + beta * make_vector(y, n, -incy);
|
||||
}
|
||||
|
||||
EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# This is a CMake version file for the Config-mode of find_package().
|
||||
#
|
||||
#
|
||||
# The version constraint is compatible with the current package under the
|
||||
# following conditions:
|
||||
# - If a version range is specified, the package version falls within the
|
||||
@@ -12,7 +12,8 @@
|
||||
# - 3...<5 matches 3.0.0.0 to <5.0.0.0
|
||||
# - 3...<5.1 matches 3.0.0.0 to <5.1.0.0
|
||||
# - 3 matches 3.0.0.0 to <4.0.0.0
|
||||
# - 3.4 matches 3.4.0.0 to <3.5.0.0
|
||||
# - 3.4 matches 3.4.0.0 to <4.0.0.0 due to semantic versioning
|
||||
# - 3.4 EXACT matches 3.4.0.0 to <3.5.0.0
|
||||
|
||||
set(PACKAGE_VERSION "@CVF_VERSION@")
|
||||
|
||||
@@ -65,26 +66,30 @@ if (PACKAGE_FIND_VERSION_RANGE)
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
endif()
|
||||
else()
|
||||
# Create exclusive upper bound.
|
||||
|
||||
# Semantic versioning upper bound.
|
||||
math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
|
||||
set(_PACKAGE_FIND_VERSION_SEMVER_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
|
||||
|
||||
# Create exclusive upper bound for exact match.
|
||||
if (PACKAGE_FIND_VERSION_COUNT EQUAL 1)
|
||||
math(EXPR _PACKAGE_FIND_VERSION_MAJOR "${PACKAGE_FIND_VERSION_MAJOR}+1")
|
||||
set(_PACKAGE_FIND_VERSION_UPPER "${_PACKAGE_FIND_VERSION_MAJOR}.0.0.0")
|
||||
set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${_PACKAGE_FIND_VERSION_SEMVER_UPPER}")
|
||||
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 2)
|
||||
math(EXPR _PACKAGE_FIND_VERSION_MINOR "${PACKAGE_FIND_VERSION_MINOR}+1")
|
||||
set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
|
||||
set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${_PACKAGE_FIND_VERSION_MINOR}.0.0")
|
||||
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 3)
|
||||
math(EXPR _PACKAGE_FIND_VERSION_PATCH "${PACKAGE_FIND_VERSION_PATCH}+1")
|
||||
set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
|
||||
set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${_PACKAGE_FIND_VERSION_PATCH}.0")
|
||||
elseif (PACKAGE_FIND_VERSION_COUNT EQUAL 4)
|
||||
math(EXPR _PACKAGE_FIND_VERSION_TWEAK "${PACKAGE_FIND_VERSION_TWEAK}+1")
|
||||
set(_PACKAGE_FIND_VERSION_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
|
||||
set(_PACKAGE_FIND_VERSION_EXACT_UPPER "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}.${PACKAGE_FIND_VERSION_PATCH}.${_PACKAGE_FIND_VERSION_TWEAK}")
|
||||
endif()
|
||||
|
||||
if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_UPPER))
|
||||
if((_PACKAGE_VERSION_FULL VERSION_LESS PACKAGE_FIND_VERSION) OR (_PACKAGE_VERSION_FULL VERSION_GREATER_EQUAL _PACKAGE_FIND_VERSION_SEMVER_UPPER))
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
|
||||
if(_PACKAGE_VERSION_FULL VERSION_LESS _PACKAGE_FIND_VERSION_EXACT_UPPER)
|
||||
set(PACKAGE_VERSION_EXACT TRUE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
292 cmake/FindAOCL.cmake (new file)
@@ -0,0 +1,292 @@
|
||||
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL)
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Description:
|
||||
# ------------
|
||||
# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL)
|
||||
# for high-performance mathematical computing on AMD processors. It searches for
|
||||
# and sets up the following AOCL components:
|
||||
#
|
||||
# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized
|
||||
# transcendental functions (exp, sin, cos, sqrt, log, etc.) with VRDA
|
||||
# (Vector Rapid Double-precision Arithmetic) support for SIMD acceleration
|
||||
#
|
||||
# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD
|
||||
# architectures, supporting both single-threaded (libblis) and multithreaded
|
||||
# (libblis-mt) execution with OpenMP parallelization
|
||||
#
|
||||
# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix
|
||||
# factorizations, eigenvalue/eigenvector computations, and linear system
|
||||
# solvers optimized for AMD processors
|
||||
#
|
||||
# The module automatically detects the appropriate library variants based on
|
||||
# configuration flags and provides proper linking setup for optimal performance
|
||||
# on Zen, Zen2, Zen3, Zen4, and Zen5 architectures.
|
||||
#
|
||||
# Variables Set:
|
||||
# --------------
|
||||
# AOCL_FOUND - True if AOCL libraries are found
|
||||
# AOCL_LIBRARIES - List of AOCL libraries to link against
|
||||
# AOCL_INCLUDE_DIRS - Include directories for AOCL headers
|
||||
# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded")
|
||||
# AOCL_CORE_LIB - Path to core AOCL math library
|
||||
# AOCL_BLAS_LIB - Path to AOCL BLAS library
|
||||
# AOCL_LAPACK_LIB - Path to AOCL LAPACK library
|
||||
#
|
||||
# Configuration Options:
|
||||
# ----------------------
|
||||
# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first
|
||||
# When OFF, searches for single-threaded BLIS only
|
||||
#
|
||||
# # For multithreaded BLIS:
|
||||
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON
|
||||
#
|
||||
# # For single-threaded BLIS:
|
||||
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF
|
||||
#
|
||||
# Library Search Paths:
|
||||
# ---------------------
|
||||
# The module searches for AOCL libraries in the following order:
|
||||
# 1. ${AOCL_ROOT}/lib (or ${AOCL_ROOT}/lib32 for 32-bit)
|
||||
# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
|
||||
# 3. ${LIB_INSTALL_DIR}
|
||||
#
|
||||
# Expected Library Names:
|
||||
# -----------------------
|
||||
# Core MathLib: amdlibm, alm, almfast
|
||||
# BLAS Single: blis
|
||||
# BLAS Multi: blis-mt
|
||||
# LAPACK: flame
|
||||
#
|
||||
# Dependencies:
|
||||
# -------------
|
||||
# The module automatically links the following system libraries:
|
||||
# - libm (standard math library)
|
||||
# - libpthread (POSIX threads)
|
||||
# - librt (real-time extensions)
|
||||
#
|
||||
# Architecture Support:
|
||||
# ---------------------
|
||||
# Optimized for AMD Zen family processors (Zen, Zen2, Zen3, Zen4, Zen5)
|
||||
# with automatic architecture detection and SIMD instruction selection.
|
||||
#
|
||||
# Developer:
|
||||
# ----------
|
||||
# Name: Sharad Saurabh Bhaskar
|
||||
# Email: shbhaska@amd.com
|
||||
#
|
||||
|
||||
if(NOT DEFINED AOCL_ROOT)
|
||||
if(DEFINED ENV{AOCL_ROOT})
|
||||
set(AOCL_ROOT $ENV{AOCL_ROOT})
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
|
||||
endif()
|
||||
else()
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(WARNING "AOCL_ROOT is not set. AOCL support will be disabled.")
|
||||
endif()
|
||||
set(AOCL_LIBRARIES "")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(AOCL_LIBRARIES)
|
||||
set(AOCL_FIND_QUIETLY TRUE)
|
||||
endif()
|
||||
|
||||
# Determine default include directories
|
||||
set(AOCL_INCLUDE_DIRS "")
|
||||
if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
|
||||
list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
|
||||
endif()
|
||||
if(EXISTS "/opt/amd/aocl/include")
|
||||
list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
|
||||
endif()
|
||||
|
||||
if(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
|
||||
# Search for the core AOCL math library.
|
||||
find_library(AOCL_CORE_LIB
|
||||
NAMES amdlibm alm almfast
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
if(AOCL_CORE_LIB)
|
||||
message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Conditional BLIS library search based on MT requirement
|
||||
if(EIGEN_AOCL_BENCH_USE_MT)
|
||||
# Search for multithreaded BLIS first
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis-mt
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
|
||||
endif()
|
||||
set(AOCL_BLAS_TYPE "multithreaded")
|
||||
else()
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
|
||||
endif()
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
endif()
|
||||
else()
|
||||
# Search for single-threaded BLIS
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
|
||||
endif()
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
else()
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(WARNING "AOCL single-threaded BLAS library not found.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Now search for AOCL LAPACK library.
|
||||
find_library(AOCL_LAPACK_LIB
|
||||
NAMES flame
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
if(AOCL_LAPACK_LIB)
|
||||
message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
else()
|
||||
# For 32-bit systems, similar search paths.
|
||||
find_library(AOCL_CORE_LIB
|
||||
NAMES amdlibm alm almfast
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
if(AOCL_CORE_LIB)
|
||||
message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Conditional BLIS library search for 32-bit
|
||||
if(EIGEN_AOCL_BENCH_USE_MT)
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis-mt
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
|
||||
endif()
|
||||
set(AOCL_BLAS_TYPE "multithreaded")
|
||||
else()
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
|
||||
endif()
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
endif()
|
||||
else()
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
|
||||
endif()
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
else()
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
message(WARNING "AOCL single-threaded BLAS library not found.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_library(AOCL_LAPACK_LIB
|
||||
NAMES flame
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if (NOT AOCL_FIND_QUIETLY)
|
||||
if(AOCL_LAPACK_LIB)
|
||||
message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Combine the found libraries into one variable.
|
||||
if(AOCL_CORE_LIB)
|
||||
set(AOCL_LIBRARIES ${AOCL_CORE_LIB})
|
||||
endif()
|
||||
if(AOCL_BLAS_LIB)
|
||||
list(APPEND AOCL_LIBRARIES ${AOCL_BLAS_LIB})
|
||||
endif()
|
||||
if(AOCL_LAPACK_LIB)
|
||||
list(APPEND AOCL_LIBRARIES ${AOCL_LAPACK_LIB})
|
||||
endif()
|
||||
if(AOCL_LIBRARIES)
|
||||
# Link against the standard math and pthread libraries as well as librt
|
||||
list(APPEND AOCL_LIBRARIES m pthread rt)
|
||||
endif()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(AOCL DEFAULT_MSG AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
|
||||
mark_as_advanced(AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
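
A hedged sketch of consuming the variables this module reports (the `my_bench` target is hypothetical, and the module is assumed to already be on `CMAKE_MODULE_PATH`):

```
find_package(AOCL)                     # non-REQUIRED: fall back gracefully if absent
if(AOCL_FOUND)
  message(STATUS "Using ${AOCL_BLAS_TYPE} AOCL BLAS: ${AOCL_BLAS_LIB}")
  target_include_directories(my_bench PRIVATE ${AOCL_INCLUDE_DIRS})
  target_link_libraries(my_bench PRIVATE ${AOCL_LIBRARIES})
else()
  message(STATUS "AOCL not found; using Eigen's built-in kernels.")
endif()
```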
|
||||
289 doc/UsingAOCL.dox (new file)
@@ -0,0 +1,289 @@
|
||||
/*
|
||||
Copyright (c) 2025, AMD Inc. All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of AMD nor the names of its contributors may
|
||||
be used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************************
|
||||
* Content : Documentation on the use of AMD AOCL through Eigen
|
||||
********************************************************************************
|
||||
*/
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen
|
||||
|
||||
Since %Eigen 3.4, users can benefit from built-in AMD® Optimizing CPU Libraries (AOCL) optimizations, provided an installed copy of AOCL 5.0 (or later) is available.
|
||||
|
||||
<a href="https://www.amd.com/en/developer/aocl.html"> AMD AOCL </a> provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures.
|
||||
|
||||
\note
|
||||
AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use.
|
||||
|
||||
Using AMD AOCL through %Eigen is straightforward:
|
||||
-# export \c AOCL_ROOT into your environment
|
||||
-# define one of the AOCL macros before including any %Eigen headers (see table below)
|
||||
-# link your program to AOCL libraries (BLIS, FLAME, LibM)
|
||||
-# ensure your system supports the target architecture optimizations
|
||||
|
||||
When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines.
|
||||
These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
|
||||
Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
|
||||
|
||||
The AOCL integration targets three core components:
|
||||
- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies
|
||||
- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality
|
||||
- **LibM**: Optimized standard math routines with vectorized implementations
|
||||
|
||||
\section TopicUsingAOCL_Macros Configuration Macros
|
||||
|
||||
You can choose which parts will be substituted by defining one or multiple of the following macros:
|
||||
|
||||
<table class="manual">
|
||||
<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)</td></tr>
|
||||
<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)</td></tr>
|
||||
<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which would be replaced by \c gesvd.</td></tr>
|
||||
<tr class="alt"><td>\c EIGEN_USE_AOCL_VML </td><td>Enables the use of AOCL LibM vector math operations for coefficient-wise functions</td></tr>
|
||||
<tr><td>\c EIGEN_USE_AOCL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML</td></tr>
|
||||
<tr class="alt"><td>\c EIGEN_USE_AOCL_MT </td><td>Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.</td></tr>
|
||||
</table>
|
||||
|
||||
\note The AOCL integration automatically enables optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function call overhead.
|
||||
|
||||
\section TopicUsingAOCL_Performance Performance Considerations
|
||||
|
||||
The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations:
|
||||
|
||||
- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library
|
||||
- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5)
|
||||
- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously
|
||||
- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation
|
||||
|
||||
\section TopicUsingAOCL_Types Supported Data Types and Sizes
|
||||
|
||||
AOCL acceleration is applied to:
|
||||
- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double>
|
||||
- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD
|
||||
- **Storage order**: Both column-major (default) and row-major layouts
|
||||
- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures
|
||||
|
||||
The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float.
|
||||
|
||||
\section TopicUsingAOCL_Functions Vector Math Functions
|
||||
|
||||
The following table summarizes coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML:
|
||||
|
||||
<table class="manual">
|
||||
<tr><th>Code example</th><th>AOCL routines</th></tr>
|
||||
<tr><td>\code
|
||||
v2 = v1.array().exp();
|
||||
v2 = v1.array().sin();
|
||||
v2 = v1.array().cos();
|
||||
v2 = v1.array().tan();
|
||||
v2 = v1.array().log();
|
||||
v2 = v1.array().log10();
|
||||
v2 = v1.array().log2();
|
||||
v2 = v1.array().sqrt();
|
||||
v2 = v1.array().pow(1.5);
|
||||
v2 = v1.array() + v2.array();
|
||||
\endcode</td><td>\code
|
||||
amd_vrda_exp
|
||||
amd_vrda_sin
|
||||
amd_vrda_cos
|
||||
amd_vrda_tan
|
||||
amd_vrda_log
|
||||
amd_vrda_log10
|
||||
amd_vrda_log2
|
||||
amd_vrda_sqrt
|
||||
amd_vrda_pow
|
||||
amd_vrda_add
|
||||
\endcode</td></tr>
|
||||
</table>
|
||||
|
||||
In the examples, v1 and v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD.
|
||||
|
||||
\section TopicUsingAOCL_Example Complete Example
|
||||
|
||||
\code
|
||||
#define EIGEN_USE_AOCL_MT
|
||||
#include <iostream>
|
||||
#include <Eigen/Dense>
|
||||
|
||||
int main() {
|
||||
const int n = 2048;
|
||||
|
||||
// Large matrices automatically use AOCL-BLIS for multiplication
|
||||
Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
|
||||
Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
|
||||
Eigen::MatrixXd C = A * B; // Dispatched to dgemm
|
||||
|
||||
// Large vectors automatically use AOCL LibM for math functions
|
||||
Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10);
|
||||
Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin
|
||||
|
||||
// LAPACK decompositions use AOCL-FLAME
|
||||
Eigen::LLT<Eigen::MatrixXd> llt(A); // Dispatched to dpotrf
|
||||
|
||||
std::cout << "Matrix norm: " << C.norm() << std::endl;
|
||||
std::cout << "Vector result norm: " << result.norm() << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
\endcode
|
||||
|
||||
\section TopicUsingAOCL_Building Building and Linking
|
||||
|
||||
To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries:
|
||||
|
||||
\code
|
||||
export AOCL_ROOT=/path/to/aocl
|
||||
clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \
|
||||
-I./install/include -I${AOCL_ROOT}/include \
|
||||
-Wno-parentheses my_app.cpp \
|
||||
        -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \
|
||||
-lpthread -lrt -lm -lomp \
|
||||
-o eigen_aocl_example
|
||||
\endcode
|
||||
|
||||
For multi-threaded performance, use the multi-threaded BLIS library:
|
||||
\code
|
||||
clang++ -O3 -g -DEIGEN_USE_AOCL_MT \
|
||||
-I./install/include -I${AOCL_ROOT}/include \
|
||||
-Wno-parentheses my_app.cpp \
|
||||
        -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \
|
||||
-lpthread -lrt -lm -lomp \
|
||||
-o eigen_aocl_example
|
||||
\endcode
|
||||
|
||||
Key compiler and linker flags:
|
||||
- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML)
|
||||
- \c -DEIGEN_USE_AOCL_MT: Enable multi-threaded version (uses \c -lblis-mt)
|
||||
- \c -lblis: Single-threaded BLIS library
|
||||
- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance)
|
||||
- \c -lflame: FLAME LAPACK implementation
|
||||
- \c -lamdlibm: AMD LibM vector math library
|
||||
- \c -lomp: OpenMP runtime for multi-threading support
|
||||
- \c -lpthread -lrt: System threading and real-time libraries
|
||||
- \c -Wno-parentheses: Suppress common warnings when using AOCL headers
|
||||
|
||||
\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support
|
||||
|
||||
To build Eigen with AOCL support, use the following CMake configuration:
|
||||
|
||||
\code
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_C_COMPILER=clang \
|
||||
-DCMAKE_CXX_COMPILER=clang++ \
|
||||
-DCMAKE_INSTALL_PREFIX=$PWD/install \
|
||||
-DINCLUDE_INSTALL_DIR=$PWD/install/include \
|
||||
&& make install -j$(nproc)
|
||||
\endcode
|
||||
|
||||
|
||||
To build Eigen with AOCL integration and benchmarking capabilities, use the following CMake configuration:
|
||||
|
||||
\code
|
||||
cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \
|
||||
-DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \
|
||||
-DEIGEN_AOCL_BENCH_USE_MT=OFF \
|
||||
-DEIGEN_AOCL_BENCH_ARCH=znver5 \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DCMAKE_C_COMPILER=clang \
|
||||
-DCMAKE_CXX_COMPILER=clang++ \
|
||||
-DCMAKE_INSTALL_PREFIX=$PWD/install \
|
||||
-DINCLUDE_INSTALL_DIR=$PWD/install/include \
|
||||
&& make install -j$(nproc)
|
||||
\endcode
|
||||
|
||||
**CMake Configuration Parameters:**
|
||||
|
||||
<table class="manual">
|
||||
<tr><th>Parameter</th><th>Expected Values</th><th>Description</th></tr>
|
||||
<tr><td>\c EIGEN_BUILD_AOCL_BENCH</td><td>\c ON, \c OFF</td><td>Enable/disable AOCL benchmark compilation</td></tr>
|
||||
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_FLAGS</td><td>Compiler flags string</td><td>Additional compiler optimizations: \c "-O3 -mavx512f -fveclib=AMDLIBM"</td></tr>
|
||||
<tr><td>\c EIGEN_AOCL_BENCH_USE_MT</td><td>\c ON, \c OFF</td><td>Use multi-threaded AOCL libraries (\c ON recommended for performance)</td></tr>
|
||||
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_ARCH</td><td>\c znver3, \c znver4, \c znver5, \c native, \c generic</td><td>Target AMD architecture (match your CPU generation)</td></tr>
|
||||
<tr><td>\c CMAKE_BUILD_TYPE</td><td>\c Release, \c Debug, \c RelWithDebInfo</td><td>Build configuration (\c Release recommended for benchmarks)</td></tr>
|
||||
<tr class="alt"><td>\c CMAKE_C_COMPILER</td><td>\c clang, \c gcc</td><td>C compiler (clang recommended for AOCL)</td></tr>
|
||||
<tr><td>\c CMAKE_CXX_COMPILER</td><td>\c clang++, \c g++</td><td>C++ compiler (clang++ recommended for AOCL)</td></tr>
|
||||
<tr class="alt"><td>\c CMAKE_INSTALL_PREFIX</td><td>Installation path</td><td>Where to install Eigen headers</td></tr>
|
||||
<tr><td>\c INCLUDE_INSTALL_DIR</td><td>Header path</td><td>Specific path for Eigen headers</td></tr>
|
||||
</table>
|
||||
|
||||
**Architecture Selection Guide:**
|
||||
- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series)
|
||||
- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series)
|
||||
- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series)
|
||||
- \c native: Auto-detect current CPU architecture
|
||||
- \c generic: Generic x86-64 without specific optimizations
|
||||
|
||||
**Custom Compiler Flags Explanation:**
|
||||
- \c -O3: Maximum optimization level
|
||||
- \c -mavx512f: Enable AVX-512 instruction set (if supported)
|
||||
- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions
|
||||
|
||||
\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark
|
||||
|
||||
After configuring Eigen, build the AOCL benchmark executable:
|
||||
|
||||
\code
|
||||
cmake --build . --target benchmark_aocl -j$(nproc)
|
||||
\endcode
|
||||
|
||||
This creates the \c benchmark_aocl executable that demonstrates AOCL acceleration with various matrix sizes and operations.
|
||||
|
||||
**Running the Benchmark:**
|
||||
\code
|
||||
./benchmark_aocl
|
||||
\endcode
|
||||
|
||||
The benchmark will automatically compare:
|
||||
- Eigen's native performance vs AOCL-accelerated operations
|
||||
- Matrix multiplication performance (BLIS vs Eigen)
|
||||
- Vector math functions performance (LibM vs Eigen)
|
||||
- Memory bandwidth utilization and cache efficiency
|
||||
|
||||
\section TopicUsingAOCL_CMake CMake Integration
|
||||
|
||||
When using CMake, you can use the \c FindAOCL.cmake module shipped in Eigen's \c cmake/ directory; it reports its results through the \c AOCL_LIBRARIES and \c AOCL_INCLUDE_DIRS variables:

\code
list(APPEND CMAKE_MODULE_PATH "<path-to-eigen>/cmake")
find_package(AOCL REQUIRED)
target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT)
target_include_directories(my_target PRIVATE ${AOCL_INCLUDE_DIRS})
target_link_libraries(my_target PRIVATE ${AOCL_LIBRARIES})
\endcode
|
||||
|
||||
\section TopicUsingAOCL_Troubleshooting Troubleshooting
|
||||
|
||||
Common issues and solutions:
|
||||
|
||||
- **Link errors**: Ensure \c AOCL_ROOT is set and libraries are in \c LD_LIBRARY_PATH
|
||||
- **Performance not improved**: Verify you're using matrices/vectors larger than the threshold
|
||||
- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count
|
||||
- **Architecture mismatch**: Use appropriate \c -march flag for your AMD processor
|
||||
|
||||
\section TopicUsingAOCL_Links Links
|
||||
|
||||
- AMD AOCL can be downloaded for free <a href="https://www.amd.com/en/developer/aocl.html">here</a>
|
||||
- AOCL User Guide and documentation available on the AMD Developer Portal
|
||||
- AOCL is also available through package managers and containerized environments
|
||||
|
||||
*/
|
||||
|
||||
}
|
||||
@@ -110,7 +110,9 @@ if (EIGEN_BUILD_SHARED_LIBS)
|
||||
# Build LAPACK but link BLAS.
|
||||
target_compile_definitions(eigen_lapack PUBLIC "EIGEN_BLAS_LINK_DLL" "EIGEN_LAPACK_BUILD_DLL")
|
||||
target_link_libraries(eigen_lapack eigen_blas)
|
||||
set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden)
|
||||
set_target_properties(eigen_lapack PROPERTIES CXX_VISIBILITY_PRESET hidden
|
||||
VERSION ${EIGEN_VERSION_NUMBER}
|
||||
SOVERSION ${EIGEN_MAJOR_VERSION})
|
||||
list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack)
|
||||
endif()
|
||||