0
0
mirror of https://gitlab.com/libeigen/eigen.git synced 2026-01-18 17:31:19 +01:00

Aocl integration updated

libeigen/eigen!1952
This commit is contained in:
sharad bhaskar
2025-11-24 17:20:42 +00:00
committed by Rasmus Munk Larsen
parent a6630c53c1
commit 8a1083e9bf
8 changed files with 1527 additions and 8 deletions

View File

@@ -71,6 +71,7 @@ endif()
# Optional benchmark suites; each one is OFF unless explicitly requested.
option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF)
# Gates the benchmark_aocl executable built from bench/benchmark_aocl.cpp.
option(EIGEN_BUILD_AOCL_BENCH "Build AOCL benchmark" OFF)
# Avoid building docs if included from another project.
# Building documentation requires creating and running executables on the host
# platform. We shouldn't do this if cross-compiling.
@@ -305,17 +306,30 @@ if (EIGEN_IS_BUILDING_)
set(CMAKE_INCLUDE_CURRENT_DIR OFF)
# Locate the standard math library; its absence is reported as a fatal error below.
find_package(StandardMathLibrary)
# CMP0074 NEW: find_package() honours <PackageName>_ROOT hints (here: AOCL_ROOT).
cmake_policy(SET CMP0074 NEW)
# AOCL is optional: QUIET suppresses the not-found message; AOCL_FOUND is tested afterwards.
find_package(AOCL QUIET)
# Start from an empty explicit link list; located libraries are added to it afterwards.
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
if(NOT STANDARD_MATH_LIBRARY_FOUND)
message(FATAL_ERROR
"Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
else()
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
else()
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
# When AOCL was located, add its libraries to the explicit link list and make
# its headers visible through the directory-level include path.
if(AOCL_FOUND)
list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${AOCL_LIBRARIES})
if(AOCL_INCLUDE_DIRS)
# NOTE(review): include_directories() affects every target created in this
# directory scope; a target-scoped target_include_directories() may be
# preferable -- confirm which targets actually need the AOCL headers.
include_directories(${AOCL_INCLUDE_DIRS})
endif()
endif()
# The standard math library is mandatory: abort configuration without it,
# otherwise add it to the explicit link list.
if(NOT STANDARD_MATH_LIBRARY_FOUND)
  message(FATAL_ERROR
    "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
else()
  # EIGEN_STANDARD_LIBRARIES_TO_LINK_TO is a CMake list (the AOCL libraries may
  # already have been appended to it). Appending keeps every library a separate
  # list element; joining with a space instead would fuse the last existing
  # entry and the math library into one bogus item such as "libflame.so m".
  # When STANDARD_MATH_LIBRARY is empty nothing is appended.
  list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${STANDARD_MATH_LIBRARY})
  # Clean up any leading/trailing whitespace in the variable to avoid CMP0004 errors
  string(STRIP "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}" EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
endif()
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
else()
@@ -743,6 +757,57 @@ endif()
if(NOT WIN32 AND EIGEN_BUILD_SPBENCH)
add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
endif()
#--------------------------------------------------------------------------------------#
# AOCL BENCHMARK BUILD SECTION #
#--------------------------------------------------------------------------------------#
# Builds the optional `benchmark_aocl` executable (excluded from the default
# build). User-facing knobs:
#   EIGEN_AOCL_BENCH_ARCH   - value passed to -march= when the compiler accepts it
#   EIGEN_AOCL_BENCH_FLAGS  - extra compile flags; "-fopenmp" also enables OpenMP linking
#   EIGEN_AOCL_BENCH_USE_MT - define EIGEN_USE_AOCL_MT instead of EIGEN_USE_AOCL_ALL
if(EIGEN_BUILD_AOCL_BENCH)
  # Declare every knob in the cache so it is discoverable in cmake-gui/ccmake.
  # The defaults reproduce the behavior of leaving the variables unset.
  set(EIGEN_AOCL_BENCH_ARCH "znver5" CACHE STRING "Target architecture for AOCL benchmark")
  set(EIGEN_AOCL_BENCH_FLAGS "" CACHE STRING "Extra compile flags for the AOCL benchmark (e.g. -fopenmp)")
  option(EIGEN_AOCL_BENCH_USE_MT
    "Build the AOCL benchmark with EIGEN_USE_AOCL_MT instead of EIGEN_USE_AOCL_ALL" OFF)

  add_executable(benchmark_aocl EXCLUDE_FROM_ALL bench/benchmark_aocl.cpp)

  # check_cxx_compiler_flag() caches its result under the given variable name.
  # Deriving the name from the architecture makes the check run again when
  # EIGEN_AOCL_BENCH_ARCH is changed instead of reusing a stale answer.
  include(CheckCXXCompilerFlag)
  string(MAKE_C_IDENTIFIER
    "EIGEN_AOCL_BENCH_SUPPORTS_MARCH_${EIGEN_AOCL_BENCH_ARCH}" aocl_bench_arch_check)
  check_cxx_compiler_flag("-march=${EIGEN_AOCL_BENCH_ARCH}" ${aocl_bench_arch_check})
  if(${aocl_bench_arch_check})
    target_compile_options(benchmark_aocl PRIVATE -O3 -Wno-shadow -march=${EIGEN_AOCL_BENCH_ARCH})
  else()
    message(WARNING "${EIGEN_AOCL_BENCH_ARCH} architecture not supported by compiler")
    target_compile_options(benchmark_aocl PRIVATE -O3)
  endif()

  # Add custom flags if provided
  if(EIGEN_AOCL_BENCH_FLAGS)
    separate_arguments(CUSTOM_FLAGS NATIVE_COMMAND "${EIGEN_AOCL_BENCH_FLAGS}")
    target_compile_options(benchmark_aocl PRIVATE ${CUSTOM_FLAGS})

    # Check if OpenMP is requested in custom flags and link it
    string(FIND "${EIGEN_AOCL_BENCH_FLAGS}" "-fopenmp" OPENMP_REQUESTED)
    if(NOT OPENMP_REQUESTED EQUAL -1)
      find_package(OpenMP)
      if(OpenMP_CXX_FOUND)
        target_link_libraries(benchmark_aocl PRIVATE OpenMP::OpenMP_CXX)
      else()
        # Generic fallback: let compiler handle OpenMP linking
        if(MSVC)
          target_compile_options(benchmark_aocl PRIVATE "/openmp")
        else()
          target_compile_options(benchmark_aocl PRIVATE "-fopenmp")
          target_link_options(benchmark_aocl PRIVATE "-fopenmp")
        endif()
        message(STATUS "Using compiler OpenMP flags as fallback")
      endif()
    endif()
  endif()

  # NOTE(review): INCLUDE_INSTALL_DIR is defined outside this section; confirm
  # it resolves to the intended header location for an in-tree build.
  target_include_directories(benchmark_aocl PRIVATE ${INCLUDE_INSTALL_DIR})

  # Select the AOCL integration mode seen by the Eigen headers.
  if(EIGEN_AOCL_BENCH_USE_MT)
    target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_MT)
  else()
    target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_ALL)
  endif()

  # Use the keyword signature consistently for every link call on this target.
  target_link_libraries(benchmark_aocl PRIVATE Eigen3::Eigen)
  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
    target_link_libraries(benchmark_aocl PRIVATE ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
  endif()
endif()
#----------------------------------------------------------------------------------------#
if (EIGEN_BUILD_DEMOS)
add_subdirectory(demos EXCLUDE_FROM_ALL)
@@ -792,6 +857,9 @@ if(PROJECT_IS_TOP_LEVEL)
if (EIGEN_BUILD_LAPACK)
message(STATUS "lapack | Build LAPACK subset library (not the same thing as Eigen)")
endif()
if(EIGEN_BUILD_AOCL_BENCH)
message(STATUS "benchmark_aocl | Build AOCL benchmark executable")
endif()
message(STATUS "------------+--------------------------------------------------------------")
message(STATUS "")
endif()
@@ -799,3 +867,4 @@ endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
message(STATUS "")

View File

@@ -53,6 +53,8 @@
// this include file manages BLAS and MKL related macros
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
// Likewise for AMD AOCL: defines the EIGEN_USE_AOCL_* feature macros and pulls
// in the AOCL headers when EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT is defined.
#include "src/Core/util/AOCL_Support.h"
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
@@ -463,6 +465,10 @@ using std::ptrdiff_t;
#include "src/Core/Assign_MKL.h"
#endif
// Assignment specializations that hand supported element-wise operations to
// the AOCL vector math routines; only compiled in when VML dispatch is enabled.
#ifdef EIGEN_USE_AOCL_VML
#include "src/Core/Assign_AOCL.h"
#endif
#include "src/Core/GlobalFunctions.h"
// IWYU pragma: end_exports

View File

@@ -0,0 +1,301 @@
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*
* Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
*
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
*
* Description:
* ------------
* This file implements a high-performance dispatch layer that automatically
* routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
* Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
* performance.
*
* The dispatch system uses C++ template specialization to intercept Eigen's
* assignment operations and redirect them to AOCL's VRDA functions, which
* provide optimized implementations for AMD Zen architectures.
*
* Key Features:
* -------------
* 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
* requiring code changes in user applications
*
* 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
* family processors with automatic SIMD instruction selection (AVX2, AVX-512)
*
* 3. Threshold-Based Activation: Only activates for vectors larger than
* EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on
* small vectors
*
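* 4. Precision-Specific Handling:
* - Double precision: AOCL VRDA vectorized functions
* - Single precision: not dispatched. The float (scalar fallback) specializations
*   are defined below but not instantiated, so float expressions use Eigen's
*   standard evaluation path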
*
* 5. Memory Layout Compatibility: Ensures direct memory access and compatible
* storage orders between source and destination for optimal performance
*
* Supported Operations:
* ---------------------
* UNARY OPERATIONS (vector → vector):
* - exp(), exp2(), sin(), cos(), sqrt(), cbrt(), abs(), log(), log10(), log2()
*
* BINARY OPERATIONS (vector op vector → vector):
* - +, pow(), max(), min() (element-wise * is not dispatched by this file)
*
* Template Specialization Mechanism:
* -----------------------------------
* The system works by specializing Eigen's Assignment template for:
* 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
* 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
* 3. Dense2Dense assignment context with AOCL-compatible traits
*
* Dispatch conditions (all must be true):
* - Source and destination have DirectAccessBit (contiguous memory)
* - Compatible storage orders (both row-major or both column-major)
* - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size
* - Supported data type (currently double precision for VRDA)
*
* Integration Example:
* --------------------
* // Standard Eigen code - no changes required
* VectorXd x = VectorXd::Random(10000);
* VectorXd y = VectorXd::Random(10000);
* VectorXd result;
*
* // These operations are automatically dispatched to AOCL:
* result = x.array().exp(); // → amd_vrda_exp()
* result = x.array().sin(); // → amd_vrda_sin()
* result = x.array() + y.array(); // → amd_vrda_add()
* result = x.array().pow(y.array()); // → amd_vrda_pow()
*
* Configuration:
* --------------
* Required preprocessor definitions:
* - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
* - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
*
* Compilation Requirements:
* -------------------------
* Include paths:
* - AOCL headers: -I${AOCL_ROOT}/include
* - Eigen headers: -I/path/to/eigen
*
* Link libraries:
* - AOCL MathLib: -lamdlibm
* - Standard math: -lm
*
* Compiler flags:
* - Optimization: -O3 (required for inlining)
* - Architecture: -march=znver5 or -march=native
* - Vectorization: -mfma -mavx512f (if supported)
*
* Platform Support:
* ------------------
* - Primary: Linux x86_64 with AMD Zen family processors
* - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
* - AOCL Version: 4.0+ (with VRDA support)
*
* Error Handling:
* ---------------
* - Graceful fallback to scalar operations for unsupported configurations
* - Compile-time detection of AOCL availability
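* - Runtime validation with eigen_assert(): source and destination dimensions
*   must match and the element count must not exceed INT_MAX (alignment is not
*   checked)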
*
* Developer:
* ----------
* Name: Sharad Saurabh Bhaskar
* Email: shbhaska@amd.com
* Organization: Advanced Micro Devices, Inc.
*/
#ifndef EIGEN_ASSIGN_AOCL_H
#define EIGEN_ASSIGN_AOCL_H
namespace Eigen {
namespace internal {
// Traits for unary operations.
template <typename Dst, typename Src> class aocl_assign_traits {
private:
enum {
DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
: (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
: int(Dst::RowsAtCompileTime),
LargeEnough =
(InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
};
public:
enum {
EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess &&
StorageOrdersAgree && LargeEnough,
Traversal = LinearTraversal
};
};
// Traits for binary operations (e.g., add, pow).
template <typename Dst, typename Lhs, typename Rhs>
class aocl_assign_binary_traits {
private:
enum {
DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
(int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
: (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
: int(Dst::RowsAtCompileTime),
LargeEnough =
(InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
};
public:
enum {
EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess &&
RhsHasDirectAccess && StorageOrdersAgree && LargeEnough
};
};
// Unary operation dispatch for float (scalar fallback).
//
// Expands to an Assignment specialization for `dst = EIGENOP(src)` on float
// operands accepted by aocl_assign_traits. The result is computed element by
// element with std::EIGENOP over the raw data() pointers.
//
// The destination is resized first (when its type allows it) so that assigning
// to a default-constructed or differently sized object works; without this the
// size assertion fires, or nothing is computed when assertions are disabled.
#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP)                                 \
  template <typename DstXprType, typename SrcXprNested>                          \
  struct Assignment<                                                             \
      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>,      \
      assign_op<float, float>, Dense2Dense,                                      \
      std::enable_if_t<                                                          \
          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> {        \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>             \
        SrcXprType;                                                              \
    static void run(DstXprType &dst, const SrcXprType &src,                      \
                    const assign_op<float, float> &func) {                       \
      resize_if_allowed(dst, src, func);                                         \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());        \
      Eigen::Index n = dst.size();                                               \
      if (n <= 0)                                                                \
        return;                                                                  \
      const float *input =                                                       \
          reinterpret_cast<const float *>(src.nestedExpression().data());        \
      float *output = reinterpret_cast<float *>(dst.data());                     \
      for (Eigen::Index i = 0; i < n; ++i) {                                     \
        output[i] = std::EIGENOP(input[i]);                                      \
      }                                                                          \
    }                                                                            \
  };
// Unary operation dispatch for double (AOCL vectorized).
//
// Expands to an Assignment specialization for `dst = EIGENOP(src)` on double
// operands accepted by aocl_assign_traits. The whole operation is forwarded to
// AOCLOP(count, input, output) in a single call.
//
// The destination is resized first (when its type allows it) so that assigning
// to a default-constructed or differently sized object works; without this the
// size assertion fires, or nothing is computed when assertions are disabled.
// AOCLOP takes an int element count, hence the INT_MAX check and conversion.
#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP)                        \
  template <typename DstXprType, typename SrcXprNested>                          \
  struct Assignment<                                                             \
      DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>,     \
      assign_op<double, double>, Dense2Dense,                                    \
      std::enable_if_t<                                                          \
          aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> {        \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>            \
        SrcXprType;                                                              \
    static void run(DstXprType &dst, const SrcXprType &src,                      \
                    const assign_op<double, double> &func) {                     \
      resize_if_allowed(dst, src, func);                                         \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());        \
      Eigen::Index n = dst.size();                                               \
      eigen_assert(n <= INT_MAX &&                                               \
                   "AOCL does not support arrays larger than INT_MAX");          \
      if (n <= 0)                                                                \
        return;                                                                  \
      const double *input =                                                      \
          reinterpret_cast<const double *>(src.nestedExpression().data());       \
      double *output = reinterpret_cast<double *>(dst.data());                   \
      int aocl_n = internal::convert_index<int>(n);                              \
      AOCLOP(aocl_n, const_cast<double *>(input), output);                       \
    }                                                                            \
  };
// Float instantiations are currently disabled: the line below is kept commented
// out, so single-precision expressions do not use the scalar fallback macro.
// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)
// Instantiate unary calls for double (AOCL vectorized).
// Each line pairs Eigen's scalar_<op>_op<double> functor with the AOCL routine
// that receives the call.
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)
// Binary operation dispatch for float (scalar fallback).
//
// Expands to an Assignment specialization for `dst = EIGENOP(lhs, rhs)` on
// float operands accepted by aocl_assign_binary_traits. The result is computed
// element by element as STDFUNC(lhs[i], rhs[i]) over the raw data() pointers.
//
// The destination is resized first (when its type allows it) so that assigning
// to a default-constructed or differently sized object works; without this the
// size assertion fires, or nothing is computed when assertions are disabled.
#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC)                       \
  template <typename DstXprType, typename LhsXprNested,                          \
            typename RhsXprNested>                                               \
  struct Assignment<                                                             \
      DstXprType,                                                                \
      CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested,           \
                    RhsXprNested>,                                               \
      assign_op<float, float>, Dense2Dense,                                      \
      std::enable_if_t<aocl_assign_binary_traits<                                \
          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> {             \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested,     \
                          RhsXprNested>                                          \
        SrcXprType;                                                              \
    static void run(DstXprType &dst, const SrcXprType &src,                      \
                    const assign_op<float, float> &func) {                       \
      resize_if_allowed(dst, src, func);                                         \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());        \
      Eigen::Index n = dst.size();                                               \
      if (n <= 0)                                                                \
        return;                                                                  \
      const float *lhs = reinterpret_cast<const float *>(src.lhs().data());      \
      const float *rhs = reinterpret_cast<const float *>(src.rhs().data());      \
      float *output = reinterpret_cast<float *>(dst.data());                     \
      for (Eigen::Index i = 0; i < n; ++i) {                                     \
        output[i] = STDFUNC(lhs[i], rhs[i]);                                     \
      }                                                                          \
    }                                                                            \
  };
// Binary operation dispatch for double (AOCL vectorized).
//
// Expands to an Assignment specialization for `dst = EIGENOP(lhs, rhs)` on
// double operands accepted by aocl_assign_binary_traits. The whole operation is
// forwarded to AOCLOP(count, lhs, rhs, output) in a single call.
//
// The destination is resized first (when its type allows it) so that assigning
// to a default-constructed or differently sized object works; without this the
// size assertion fires, or nothing is computed when assertions are disabled.
// AOCLOP takes an int element count, hence the INT_MAX check and conversion.
#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP)                       \
  template <typename DstXprType, typename LhsXprNested,                          \
            typename RhsXprNested>                                               \
  struct Assignment<                                                             \
      DstXprType,                                                                \
      CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested,         \
                    RhsXprNested>,                                               \
      assign_op<double, double>, Dense2Dense,                                    \
      std::enable_if_t<aocl_assign_binary_traits<                                \
          DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> {             \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested,   \
                          RhsXprNested>                                          \
        SrcXprType;                                                              \
    static void run(DstXprType &dst, const SrcXprType &src,                      \
                    const assign_op<double, double> &func) {                     \
      resize_if_allowed(dst, src, func);                                         \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());        \
      Eigen::Index n = dst.size();                                               \
      eigen_assert(n <= INT_MAX &&                                               \
                   "AOCL does not support arrays larger than INT_MAX");          \
      if (n <= 0)                                                                \
        return;                                                                  \
      const double *lhs = reinterpret_cast<const double *>(src.lhs().data());    \
      const double *rhs = reinterpret_cast<const double *>(src.rhs().data());    \
      double *output = reinterpret_cast<double *>(dst.data());                   \
      int aocl_n = internal::convert_index<int>(n);                              \
      AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs),       \
             output);                                                            \
    }                                                                            \
  };
// Float instantiations are currently disabled; the two lines below are kept
// commented out for reference.
// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>)  // scalar_sum_op for addition
// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)
// Instantiate binary calls for double (AOCL vectorized).
// Each line pairs Eigen's scalar_<op>_op<double, double> functor with the AOCL
// routine that receives the call.
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)
} // namespace internal
} // namespace Eigen
#endif // EIGEN_ASSIGN_AOCL_H

View File

@@ -0,0 +1,175 @@
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*
* AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
*
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
*
* Description:
* ------------
* This header file serves as the central configuration and integration point
* for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
* It orchestrates the integration of multiple AOCL components to provide
* optimal mathematical computing performance on AMD Zen family processors.
*
* AOCL Component Integration:
* ---------------------------
* 1. AOCL Vector Math Library (VML):
* - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
* - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
* - SIMD vectorization for AMD architectures (AVX2, AVX-512)
* - Headers: amdlibm.h, amdlibm_vec.h
*
* 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
* - High-performance Basic Linear Algebra Subprograms
* - Supports single-threaded (libblis) and multithreaded (libblis-mt)
* variants
* - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
* - Headers: cblas.h, blis.h
*
* 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
* - Dense linear algebra operations: factorizations, eigenvalue solvers
* - Matrix decompositions: LU, Cholesky, QR, SVD
* - Eigenvalue/eigenvector computations optimized for AMD hardware
* - Headers: LAPACKE interface
*
* Configuration Parameters:
* ------------------------------
* EIGEN_AOCL_VML_THRESHOLD (default: 128):
* - Minimum vector size for AOCL VML dispatch
* - Smaller vectors use standard Eigen to avoid function call overhead
* - Optimal values: 64-512 depending on operation and data characteristics
*
*
*
* Architecture Support:
* ---------------------
* Optimized for AMD processor families:
* - Zen Architecture (Naples, Rome): AVX2 optimization
* - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
* - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
* - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
* - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
*
*
* Dependencies:
* -------------
* Required AOCL components:
* - libamdlibm: Core math library with VRDA functions
* - libblis or libblis-mt: BLAS implementation
* - libflame: LAPACK implementation
*
* System requirements:
* - AMD x86_64 processor (optimal performance)
* - Linux, Windows, or compatible POSIX system
* - C++14 or later standard (the VML dispatch layer uses std::enable_if_t)
* - CMake 3.5+ for build system integration
*
* Developer:
* ----------
* Name: Sharad Saurabh Bhaskar
* Email: shbhaska@amd.com
* Organization: Advanced Micro Devices, Inc.
*/
#ifndef EIGEN_AOCL_SUPPORT_H
#define EIGEN_AOCL_SUPPORT_H
// Active when any AOCL switch is defined:
//   EIGEN_USE_AOCL_ALL - VML + BLAS + LAPACKE (single-threaded BLIS)
//   EIGEN_USE_AOCL_MT  - VML + BLAS + LAPACKE (multithreaded BLIS)
//   EIGEN_USE_AOCL_VML - vector math dispatch only
// The standalone VML switch has to be accepted here as well: the VML dispatch
// header is included whenever EIGEN_USE_AOCL_VML is defined and relies on the
// threshold macro and the AOCL math headers set up below.
#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT) || \
    defined(EIGEN_USE_AOCL_VML)
#include <complex>

// Define AOCL component flags based on main flags.
// Every definition is guarded so that a value supplied on the command line
// (e.g. -DEIGEN_USE_LAPACKE, which defines the macro as 1) is not redefined
// with a different replacement list.
#ifdef EIGEN_USE_AOCL_ALL
#ifndef EIGEN_USE_AOCL_VML
#define EIGEN_USE_AOCL_VML  // Enable AOCL Vector Math Library
#endif
#ifndef EIGEN_USE_AOCL_BLAS
#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)
#endif
// Enable Eigen BLAS backend only if BLIS provides compatible interface
#if defined(EIGEN_AOCL_BLIS_COMPATIBLE) && !defined(EIGEN_USE_BLAS)
#define EIGEN_USE_BLAS  // Enable Eigen BLAS backend
#endif
#ifndef EIGEN_USE_LAPACKE
#define EIGEN_USE_LAPACKE  // Enable LAPACK backend (FLAME)
#endif
#endif

#ifdef EIGEN_USE_AOCL_MT
#ifndef EIGEN_USE_AOCL_VML
#define EIGEN_USE_AOCL_VML  // Enable AOCL Vector Math Library
#endif
#ifndef EIGEN_USE_AOCL_BLAS
#define EIGEN_USE_AOCL_BLAS  // Enable AOCL BLAS (BLIS)
#endif
// For multithreaded: EIGEN_USE_BLAS is intentionally NOT defined to avoid
// signature conflicts with the BLIS interface; BLIS is reached through
// EIGEN_USE_AOCL_BLAS instead.
// NOTE(review): earlier comments described LAPACKE as disabled in MT mode, but
// the macro has always been defined here; the behavior is kept -- confirm the
// intent.
#ifndef EIGEN_USE_LAPACKE
#define EIGEN_USE_LAPACKE  // Enable LAPACK backend (FLAME)
#endif
#ifndef EIGEN_AOCL_USE_BLIS_MT
#define EIGEN_AOCL_USE_BLIS_MT 1  // Enable multithreaded BLIS
#endif
#endif

// At this point EIGEN_USE_AOCL_VML is always defined: either by the user or by
// one of the two sections above.

// Configuration constants - define these for any AOCL usage
#ifndef EIGEN_AOCL_VML_THRESHOLD
#define EIGEN_AOCL_VML_THRESHOLD 128  // Threshold for VML dispatch
#endif
#ifndef AOCL_SIMD_WIDTH
#define AOCL_SIMD_WIDTH 8  // AVX-512: 512 bits / 64 bits per double
#endif

// Include AOCL Math Library headers for VML
#ifdef EIGEN_USE_AOCL_VML
#if defined(__has_include)
#if __has_include("amdlibm.h")
#include "amdlibm.h"
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
#define AMD_LIBM_VEC_EXPERIMENTAL
#endif
#if __has_include("amdlibm_vec.h")
#include "amdlibm_vec.h"
#endif
#endif
#else
// Fallback for compilers without __has_include
#include "amdlibm.h"
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
#define AMD_LIBM_VEC_EXPERIMENTAL
#endif
#include "amdlibm_vec.h"
#endif
#endif

// Include CBLAS headers when BLAS is enabled
#ifdef EIGEN_USE_AOCL_BLAS
#if defined(__has_include)
#if __has_include("cblas.h")
#include "cblas.h"
#elif __has_include("blis.h")
#include "blis.h"
#endif
#else
// Fallback
#include "cblas.h"
#endif
#endif

namespace Eigen {
// AOCL-specific type definitions
typedef std::complex<double> dcomplex;
typedef std::complex<float> scomplex;
typedef int BlasIndex;  // Standard BLAS index type
}  // namespace Eigen
#endif  // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT || EIGEN_USE_AOCL_VML
#endif // EIGEN_AOCL_SUPPORT_H

View File

@@ -53,3 +53,56 @@ $ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp
************************
* benchmark_aocl *
************************
This benchmark exercises Eigen operations using AMD Optimized Libraries
(AOCL). It is disabled by default and can be enabled when configuring the
build:
cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON
The resulting `benchmark_aocl` target is compiled with `-O3` and, if the
compiler supports it, `-march=znver5` for optimal performance on AMD
processors.
The benchmark also links to `libblis-mt.so` and `libflame.so` so BLAS and
LAPACK operations run with multithreaded AOCL when available.
By default the CMake build defines `EIGEN_USE_AOCL_ALL`, which selects the
single-threaded AOCL libraries. Configure with
`-DEIGEN_AOCL_BENCH_USE_MT=ON` to define `EIGEN_USE_AOCL_MT` instead and
build the benchmark against the multithreaded AOCL libraries.
Alternatively you can build the same benchmark using the
`Makefile` in this directory. This allows experimenting with
different compiler flags without reconfiguring CMake:
```
cd bench && make # builds with -O3 -march=znver5 by default
make clean && make CXX="clang++" # use a compiler other than g++
make clean && make MARCH="" CXXFLAGS="-O2" # example of custom flags
make AOCL_ROOT=/opt/aocl # use AOCL from a custom location
```
This Makefile links against `libblis-mt.so` and `libflame.so` so the
matrix multiplication benchmark exercises multithreaded BLIS when
`EIGEN_USE_AOCL_MT` is defined (enabled by default in the Makefile).
If you prefer to compile manually, ensure that the Eigen include path
points to the directory where `AOCL_Support.h` resides. For example:
```
clang++ -O3 -std=c++14 -I../build/install/include \
-march=znver5 -DEIGEN_USE_AOCL_MT \
benchmark_aocl.cpp -o benchmark_aocl \
-lblis-mt -lflame -lamdlibm -lpthread -lm
```
Replace `../build/install/include` with your actual Eigen install path.
When invoking `make`, you can point `AOCL_ROOT` to your AOCL
installation directory so the Makefile links against `$(AOCL_ROOT)/lib`.

362
bench/benchmark_aocl.cpp Normal file
View File

@@ -0,0 +1,362 @@
/*
* benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen
*
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Description:
* ------------
* This benchmark suite evaluates the performance of Eigen mathematical
* operations when integrated with AMD Optimizing CPU Libraries (AOCL). It
* tests:
*
* 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt,
* log, etc.) using AOCL Vector Math Library (VML) for optimized
* double-precision operations
*
* 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library
* with support for both single-threaded and multithreaded execution
*
* 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using
* libflame
*
* 4. Real-world Scenarios: Financial risk computation simulating covariance
* matrix calculations and eigenvalue analysis for portfolio optimization
*
* The benchmark automatically detects AOCL configuration and adjusts test
* execution accordingly, providing performance comparisons between standard
* Eigen operations and AOCL-accelerated implementations.
*
* Compilation:
* ------------
* # Using AOCC compiler (recommended for best AOCL compatibility):
* clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
* -I${AOCL_ROOT}/include \
* -Wno-parentheses bench/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
* -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \
* -o build/eigen_aocl_benchmark
*
* # Alternative: Using GCC with proper library paths:
* g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
* -I${AOCL_ROOT}/include \
* -Wno-parentheses bench/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
* -lamdlibm -lm -lblis -lflame -lpthread -lrt \
* -o build/eigen_aocl_benchmark
*
* # For multithreaded BLIS support:
* clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \
* -I${AOCL_ROOT}/include -Wno-parentheses bench/benchmark_aocl.cpp \
* -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \
* -o build/eigen_aocl_benchmark_mt
*
* Usage:
* ------
* export AOCL_ROOT=/path/to/aocl/installation
* export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH
* ./build/eigen_aocl_benchmark
*
* Developer:
* ----------
* Name: Sharad Saurabh Bhaskar
* Email: shbhaska@amd.com
* Organization: Advanced Micro Devices, Inc.
*/
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
// Simple - just include Eigen headers
#include <Eigen/Core>
#include <Eigen/Dense>
#include <Eigen/Eigenvalues>
// Only include CBLAS if AOCL BLIS is available
#ifdef EIGEN_USE_AOCL_ALL
#include <cblas.h>
#endif
using namespace std;
using namespace std::chrono;
using namespace Eigen;
void benchmarkVectorMath(int size) {
VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0);
VectorXd result(size);
double elapsed_ms = 0;
cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl;
auto start = high_resolution_clock::now();
result = v.array().exp();
auto end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "exp() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().sin();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "sin() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().cos();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "cos() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().sqrt();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "sqrt() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().cbrt();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "cbrt() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().abs();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "abs() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().log();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "log() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().log10();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "log10() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().exp2();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "exp2() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().asin();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "asin() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().sinh();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "sinh() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().acos();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "acos() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().cosh();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "cosh() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().tan();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "tan() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().atan();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "atan() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().tanh();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "tanh() time: " << elapsed_ms << " ms" << endl;
VectorXd v2 = VectorXd::Random(size);
start = high_resolution_clock::now();
result = v.array() + v2.array();
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "add() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().pow(2.0);
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "pow() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().max(v2.array());
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "max() time: " << elapsed_ms << " ms" << endl;
start = high_resolution_clock::now();
result = v.array().min(v2.array());
end = high_resolution_clock::now();
elapsed_ms = duration_cast<milliseconds>(end - start).count();
cout << "min() time: " << elapsed_ms << " ms" << endl;
}
// Benchmarks a dense double-precision matrix product C = A * B.
//
// Two random matSize x matSize matrices are generated, multiplied once, and
// the elapsed wall-clock time of the product alone is printed in milliseconds.
// NOTE(review): whether the product is served by BLIS or by Eigen's built-in
// kernel depends on the EIGEN_USE_* macros this file is compiled with; the
// banner below is printed unconditionally.
//
//   matSize - number of rows and columns of the square operands
void benchmarkMatrixMultiplication(int matSize) {
  cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize
       << ") ---" << endl;
  MatrixXd A = MatrixXd::Random(matSize, matSize);
  MatrixXd B = MatrixXd::Random(matSize, matSize);
  MatrixXd C(matSize, matSize);

  // steady_clock is monotonic, so the measured interval cannot be distorted
  // by adjustments of the system clock.
  const auto start = std::chrono::steady_clock::now();
  C = A * B;
  const auto end = std::chrono::steady_clock::now();

  // A floating-point duration keeps the sub-millisecond part instead of
  // truncating to a whole number of milliseconds.
  const double elapsed_ms =
      std::chrono::duration<double, std::milli>(end - start).count();
  cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl;
}
// Benchmarks BLIS directly through its CBLAS interface (cblas_dgemm) on
// random row-major square matrices and prints the elapsed time.
//
// The real benchmark is only compiled when EIGEN_AOCL_USE_BLIS_MT is defined;
// otherwise the function just prints a notice.
//
//   matSize    - number of rows and columns of the square operands
//   numThreads - value exported through BLIS_NUM_THREADS; values below 1 are
//                replaced by 1
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
#if defined(EIGEN_AOCL_USE_BLIS_MT)
  // thread::hardware_concurrency() is allowed to return 0 when the count is
  // unknown; never ask BLIS for zero (or a negative number of) threads.
  if (numThreads < 1) {
    numThreads = 1;
  }
  cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
       << ", threads=" << numThreads << ") ---" << endl;

  // Compute the element count in size_t: matSize * matSize overflows int for
  // dimensions above 46340.
  const std::size_t elemCount =
      static_cast<std::size_t>(matSize) * static_cast<std::size_t>(matSize);
  vector<double> A(elemCount);
  vector<double> B(elemCount);
  vector<double> C(elemCount);
  for (auto &v : A)
    v = static_cast<double>(rand()) / RAND_MAX;
  for (auto &v : B)
    v = static_cast<double>(rand()) / RAND_MAX;

  const double alpha = 1.0, beta = 0.0;
  const string th = to_string(numThreads);
  // NOTE(review): this assumes BLIS has not yet read BLIS_NUM_THREADS at this
  // point; if the library was already initialised by an earlier call the new
  // value may be ignored - confirm against the BLIS threading documentation.
  setenv("BLIS_NUM_THREADS", th.c_str(), 1);

  // Monotonic clock and fractional milliseconds: see below.
  const auto start = std::chrono::steady_clock::now();
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
              matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
              C.data(), matSize);
  const auto end = std::chrono::steady_clock::now();
  // Keep the sub-millisecond part rather than truncating to whole ms.
  const double elapsed_ms =
      std::chrono::duration<double, std::milli>(end - start).count();
  cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl;
#else
  (void)matSize;
  (void)numThreads;
  cout << "\nBLIS multithreaded support not enabled." << endl;
#endif
}
// Benchmarks a self-adjoint eigenvalue decomposition.
//
// A random matSize x matSize matrix is symmetrised and handed to
// SelfAdjointEigenSolver; only the compute() call is timed. The time is
// printed when the solver reports Success, otherwise a failure notice is
// printed.
//
//   matSize - number of rows and columns of the square input matrix
void benchmarkEigenDecomposition(int matSize) {
  cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize
       << " x " << matSize << ") ---" << endl;
  MatrixXd M = MatrixXd::Random(matSize, matSize);
  // Make the matrix symmetric (required for the self-adjoint solver).
  // The right-hand side reads M.transpose() while M is being overwritten, so
  // it must be evaluated into a temporary first; without eval() the entries
  // written late are computed from already-modified values and the result is
  // not symmetric.
  M = ((M + M.transpose()) * 0.5).eval();

  SelfAdjointEigenSolver<MatrixXd> eigensolver;
  // steady_clock is monotonic, which is what interval timing needs.
  const auto start = std::chrono::steady_clock::now();
  eigensolver.compute(M);
  const auto end = std::chrono::steady_clock::now();
  // Floating-point duration: keeps fractions of a millisecond.
  const double elapsed_ms =
      std::chrono::duration<double, std::milli>(end - start).count();

  if (eigensolver.info() == Success) {
    cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl;
  } else {
    cout << "Eigenvalue decomposition failed." << endl;
  }
}
// Function simulating a real-world FSI risk computation scenario.
// Example: Compute covariance matrix from simulated asset returns, then perform
// eigenvalue decomposition.
void benchmarkFSIRiskComputation(int numPeriods, int numAssets) {
cout << "\n--- FSI Risk Computation Benchmark ---" << endl;
cout << "Simulating " << numPeriods << " periods for " << numAssets
<< " assets." << endl;
// Simulate asset returns: each column represents an asset's returns.
MatrixXd returns = MatrixXd::Random(numPeriods, numAssets);
// Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1)
auto start = high_resolution_clock::now();
MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1);
auto end = high_resolution_clock::now();
double cov_time = duration_cast<milliseconds>(end - start).count();
cout << "Covariance matrix computation time: " << cov_time << " ms" << endl;
// Eigenvalue decomposition on covariance matrix.
SelfAdjointEigenSolver<MatrixXd> eigensolver;
start = high_resolution_clock::now();
eigensolver.compute(cov);
end = high_resolution_clock::now();
double eig_time = duration_cast<milliseconds>(end - start).count();
if (eigensolver.info() == Success) {
cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms"
<< endl;
cout << "Top 3 Eigenvalues: "
<< eigensolver.eigenvalues().tail(3).transpose() << endl;
} else {
cout << "Eigenvalue decomposition failed." << endl;
}
}
int main() {
cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl;
cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl;
cout << "Organization: Advanced Micro Devices, Inc." << endl;
cout << "License: Mozilla Public License 2.0" << endl << endl;
// Print AOCL configuration
#ifdef EIGEN_USE_AOCL_MT
cout << "AOCL Mode: MULTITHREADED (MT)" << endl;
cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl;
#elif defined(EIGEN_USE_AOCL_ALL)
cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl;
cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl;
#else
cout << "AOCL Mode: DISABLED" << endl;
cout << "Using standard Eigen implementation" << endl;
#endif
cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl;
// Benchmark vector math functions with varying vector sizes.
vector<int> vectorSizes = {5000000, 10000000, 50000000};
for (int size : vectorSizes) {
benchmarkVectorMath(size);
}
// Benchmark matrix multiplication for varying sizes.
vector<int> matrixSizes = {1024};
for (int msize : matrixSizes) {
benchmarkMatrixMultiplication(msize);
#if defined(EIGEN_AOCL_USE_BLIS_MT)
benchmarkBlisMultithreaded(msize, thread::hardware_concurrency());
#endif
}
// Benchmark LAPACK: Eigenvalue Decomposition.
for (int msize : matrixSizes) {
benchmarkEigenDecomposition(msize);
}
// Benchmark a complex FSI risk computation scenario.
// For example, simulate 10,000 time periods (days) for 500 assets.
benchmarkFSIRiskComputation(10000, 500);
cout << "\n=== Benchmark Complete ===" << endl;
return 0;
}

264
cmake/FindAOCL.cmake Normal file
View File

@@ -0,0 +1,264 @@
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#
# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL)
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Description:
# ------------
# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL)
# for high-performance mathematical computing on AMD processors. It searches for
# and sets up the following AOCL components:
#
# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized
# elementary math functions (exp, sin, cos, sqrt, log, etc.), including the
# array-based double-precision amd_vrda_* routines used for SIMD acceleration
#
# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD
# architectures, supporting both single-threaded (libblis) and multithreaded
# (libblis-mt) execution with OpenMP parallelization
#
# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix
# factorizations, eigenvalue/eigenvector computations, and linear system
# solvers optimized for AMD processors
#
# The module selects the multithreaded or single-threaded BLIS variant
# according to EIGEN_AOCL_BENCH_USE_MT and collects every library it finds in
# AOCL_LIBRARIES. The AOCL libraries themselves are tuned for the Zen, Zen2,
# Zen3, Zen4, and Zen5 architectures.
#
# Variables Set:
# --------------
# AOCL_FOUND - True if AOCL libraries are found
# AOCL_LIBRARIES - List of AOCL libraries to link against
# AOCL_INCLUDE_DIRS - Include directories for AOCL headers
# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded")
# AOCL_CORE_LIB - Path to core AOCL math library
# AOCL_BLAS_LIB - Path to AOCL BLAS library
# AOCL_LAPACK_LIB - Path to AOCL LAPACK library
#
# Configuration Options:
# ----------------------
# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first
# When OFF, searches for single-threaded BLIS only
#
# # For multithreaded BLIS:
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON
#
# # For single-threaded BLIS:
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF
#
# Library Search Paths:
# ---------------------
# The module searches for AOCL libraries in the following order:
# 1. ${AOCL_ROOT}/lib (used for both 64-bit and 32-bit searches)
# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
# 3. ${LIB_INSTALL_DIR}
#
# Expected Library Names:
# -----------------------
# Core MathLib: amdlibm, alm, almfast
# BLAS Single: blis
# BLAS Multi: blis-mt
# LAPACK: flame
#
# Dependencies:
# -------------
# The module automatically links the following system libraries:
# - libm (standard math library)
# - libpthread (POSIX threads)
# - librt (real-time extensions)
#
# Architecture Support:
# ---------------------
# The AOCL libraries target AMD Zen family processors (Zen, Zen2, Zen3, Zen4,
# Zen5). This module only locates the libraries; it performs no CPU
# architecture detection or SIMD instruction selection of its own.
#
# Developer:
# ----------
# Name: Sharad Saurabh Bhaskar
# Email: shbhaska@amd.com
#
# Resolve AOCL_ROOT.
#
# An AOCL_ROOT CMake variable (e.g. -DAOCL_ROOT=...) always wins. Otherwise
# the AOCL_ROOT environment variable is used. When neither is available the
# module keeps going: the remaining default locations are still searched, so
# this is reported as information about the search, not as a failure.
#
# All messages honour find_package(AOCL QUIET) via AOCL_FIND_QUIETLY.
if(NOT DEFINED AOCL_ROOT)
  if(DEFINED ENV{AOCL_ROOT})
    # Normalise to CMake-style forward slashes so that "${AOCL_ROOT}/lib"
    # stays a well-formed path when the environment value uses backslashes.
    file(TO_CMAKE_PATH "$ENV{AOCL_ROOT}" AOCL_ROOT)
    if(NOT AOCL_FIND_QUIETLY)
      message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
    endif()
  else()
    if(NOT AOCL_FIND_QUIETLY)
      message(WARNING
        "AOCL_ROOT is not set. Only /opt/amd/aocl and the default system "
        "locations will be searched for AOCL.")
    endif()
    # Drop any stale library list inherited from the calling scope.
    set(AOCL_LIBRARIES "")
  endif()
endif()

# The libraries are already known from an earlier call: do not repeat the
# "found" output.
if(AOCL_LIBRARIES)
  set(AOCL_FIND_QUIETLY TRUE)
endif()
# Determine the include directories.
#
# AOCL_INCLUDE_DIRS receives, in this order, every directory that exists:
#   1. ${AOCL_ROOT}/include (only when AOCL_ROOT is set)
#   2. /opt/amd/aocl/include
# It stays empty when neither directory exists.
set(AOCL_INCLUDE_DIRS "")
if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
  list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
endif()
if(EXISTS "/opt/amd/aocl/include")
  list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
endif()
# With AOCL_ROOT=/opt/amd/aocl both candidates are the same directory; list it
# once. (Only exact string matches are collapsed.)
if(AOCL_INCLUDE_DIRS)
  list(REMOVE_DUPLICATES AOCL_INCLUDE_DIRS)
endif()
# ---------------------------------------------------------------------------
# Library search
#
# Sets the cache variables AOCL_CORE_LIB, AOCL_BLAS_LIB and AOCL_LAPACK_LIB,
# and the normal variable AOCL_BLAS_TYPE ("multithreaded" or
# "single-threaded", unset when no BLIS library is found).
# Diagnostics are suppressed when the caller asked for a QUIET search.
# ---------------------------------------------------------------------------

# Choose the system-wide AOCL library directory that matches the pointer
# width of the target. CMAKE_SIZEOF_VOID_P is only available once a language
# has been enabled, so fall back to the host processor name otherwise
# (Windows hosts report x86-64 as "AMD64").
if(CMAKE_SIZEOF_VOID_P)
  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(_aocl_system_lib_dir "/opt/amd/aocl/lib64")
  else()
    set(_aocl_system_lib_dir "/opt/amd/aocl/lib32")
  endif()
elseif("${CMAKE_HOST_SYSTEM_PROCESSOR}" MATCHES "^(x86_64|AMD64|amd64)$")
  set(_aocl_system_lib_dir "/opt/amd/aocl/lib64")
else()
  set(_aocl_system_lib_dir "/opt/amd/aocl/lib32")
endif()

# HINTS are searched before the default system locations and PATHS after
# them, so an explicitly given AOCL_ROOT takes precedence over a BLIS/FLAME
# copy installed system-wide. Nothing is added for an empty AOCL_ROOT or
# LIB_INSTALL_DIR, which would otherwise expand to "/lib" or to nothing.
set(_aocl_hints "")
if(AOCL_ROOT)
  list(APPEND _aocl_hints "${AOCL_ROOT}/lib")
endif()
set(_aocl_paths "${_aocl_system_lib_dir}")
if(LIB_INSTALL_DIR)
  list(APPEND _aocl_paths "${LIB_INSTALL_DIR}")
endif()

# --- Core math library (AOCL LibM) -----------------------------------------
find_library(AOCL_CORE_LIB
  NAMES amdlibm alm almfast
  HINTS ${_aocl_hints}
  PATHS ${_aocl_paths}
)
if(NOT AOCL_FIND_QUIETLY)
  if(AOCL_CORE_LIB)
    message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
  else()
    message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
  endif()
endif()

# --- BLAS (BLIS) -----------------------------------------------------------
# With EIGEN_AOCL_BENCH_USE_MT the multithreaded library is preferred and the
# single-threaded one is only a fallback; otherwise only the single-threaded
# library is searched.
if(EIGEN_AOCL_BENCH_USE_MT)
  find_library(AOCL_BLAS_LIB
    NAMES blis-mt
    HINTS ${_aocl_hints}
    PATHS ${_aocl_paths}
  )
  if(NOT AOCL_BLAS_LIB)
    if(NOT AOCL_FIND_QUIETLY)
      message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
    endif()
    find_library(AOCL_BLAS_LIB
      NAMES blis
      HINTS ${_aocl_hints}
      PATHS ${_aocl_paths}
    )
  endif()
else()
  find_library(AOCL_BLAS_LIB
    NAMES blis
    HINTS ${_aocl_hints}
    PATHS ${_aocl_paths}
  )
endif()

# Derive the reported type from the file that was actually selected rather
# than from the branch taken above: AOCL_BLAS_LIB is a cache entry, so a
# value cached by an earlier configure run (or supplied by the user) can
# differ from what the current search would have requested.
if(AOCL_BLAS_LIB)
  get_filename_component(_aocl_blas_file "${AOCL_BLAS_LIB}" NAME)
  if(_aocl_blas_file MATCHES "blis-mt")
    set(AOCL_BLAS_TYPE "multithreaded")
    set(_aocl_blas_tag "MT")
  else()
    set(AOCL_BLAS_TYPE "single-threaded")
    set(_aocl_blas_tag "ST")
  endif()
  if(NOT AOCL_FIND_QUIETLY)
    message(STATUS "Found AOCL BLAS (${_aocl_blas_tag}) library: ${AOCL_BLAS_LIB}")
  endif()
else()
  unset(AOCL_BLAS_TYPE)
  if(NOT AOCL_FIND_QUIETLY)
    message(WARNING "AOCL single-threaded BLAS library not found.")
  endif()
endif()

# --- LAPACK (libFLAME) -----------------------------------------------------
find_library(AOCL_LAPACK_LIB
  NAMES flame
  HINTS ${_aocl_hints}
  PATHS ${_aocl_paths}
)
if(NOT AOCL_FIND_QUIETLY)
  if(AOCL_LAPACK_LIB)
    message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
  else()
    message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
  endif()
endif()

# Do not leak the helper variables into the including scope.
unset(_aocl_system_lib_dir)
unset(_aocl_hints)
unset(_aocl_paths)
unset(_aocl_blas_file)
unset(_aocl_blas_tag)
# Combine the found libraries into AOCL_LIBRARIES.
# The list is rebuilt from scratch so that a value left over from an earlier
# call or inherited from the including scope cannot survive when the core
# library is missing.
set(AOCL_LIBRARIES "")
foreach(_aocl_lib_var IN ITEMS AOCL_CORE_LIB AOCL_BLAS_LIB AOCL_LAPACK_LIB)
  # A failed find_library() leaves "<VAR>-NOTFOUND", which tests as false.
  if(${_aocl_lib_var})
    list(APPEND AOCL_LIBRARIES "${${_aocl_lib_var}}")
  endif()
endforeach()
unset(_aocl_lib_var)

# Link against the standard math and pthread libraries as well as librt.
# These names only exist on Unix-like systems other than macOS; appending
# them elsewhere would make the link step fail.
if(AOCL_LIBRARIES AND UNIX AND NOT APPLE)
  list(APPEND AOCL_LIBRARIES m pthread rt)
endif()

# AOCL counts as found only when the math, BLAS and LAPACK libraries and an
# include directory were all located; a partial set would otherwise be
# reported as usable and fail later at link time.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(AOCL
  REQUIRED_VARS
    AOCL_LIBRARIES
    AOCL_CORE_LIB
    AOCL_BLAS_LIB
    AOCL_LAPACK_LIB
    AOCL_INCLUDE_DIRS
)

# Only the find_library() results are cache entries; AOCL_LIBRARIES and
# AOCL_INCLUDE_DIRS are normal variables and must not be passed here.
mark_as_advanced(AOCL_CORE_LIB AOCL_BLAS_LIB AOCL_LAPACK_LIB)

289
doc/UsingAOCL.dox Normal file
View File

@@ -0,0 +1,289 @@
/*
Copyright (c) 2025, AMD Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of AMD nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
* Content : Documentation on the use of AMD AOCL through Eigen
********************************************************************************
*/
namespace Eigen {
/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen
In %Eigen version 3.4 and later, users can benefit from built-in optimizations based on the AMD® Optimizing CPU Libraries (AOCL), provided that a copy of AOCL 5.0 (or later) is installed.
<a href="https://www.amd.com/en/developer/aocl.html"> AMD AOCL </a> provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures.
\note
AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use.
Using AMD AOCL through %Eigen is straightforward:
-# export \c AOCL_ROOT into your environment
-# define one of the AOCL macros before including any %Eigen headers (see table below)
-# link your program to AOCL libraries (BLIS, FLAME, LibM)
-# ensure your system supports the target architecture optimizations
When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines.
These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
The AOCL integration targets three core components:
- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies
- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality
- **LibM**: Optimized standard math routines with vectorized implementations
\section TopicUsingAOCL_Macros Configuration Macros
You can choose which parts will be substituted by defining one or multiple of the following macros:
<table class="manual">
<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)</td></tr>
<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)</td></tr>
<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which would be replaced by \c gesvd.</td></tr>
<tr class="alt"><td>\c EIGEN_USE_AOCL_VML </td><td>Enables the use of AOCL LibM vector math operations for coefficient-wise functions</td></tr>
<tr><td>\c EIGEN_USE_AOCL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML</td></tr>
<tr class="alt"><td>\c EIGEN_USE_AOCL_MT </td><td>Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.</td></tr>
</table>
\note The AOCL integration automatically enables optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function call overhead.
\section TopicUsingAOCL_Performance Performance Considerations
The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations:
- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library
- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5)
- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously
- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation
\section TopicUsingAOCL_Types Supported Data Types and Sizes
AOCL acceleration is applied to:
- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double>
- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD
- **Storage order**: Both column-major (default) and row-major layouts
- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures
The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float.
\section TopicUsingAOCL_Functions Vector Math Functions
The following table summarizes coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML:
<table class="manual">
<tr><th>Code example</th><th>AOCL routines</th></tr>
<tr><td>\code
v2 = v1.array().exp();
v2 = v1.array().sin();
v2 = v1.array().cos();
v2 = v1.array().tan();
v2 = v1.array().log();
v2 = v1.array().log10();
v2 = v1.array().log2();
v2 = v1.array().sqrt();
v2 = v1.array().pow(1.5);
v2 = v1.array() + v2.array();
\endcode</td><td>\code
amd_vrda_exp
amd_vrda_sin
amd_vrda_cos
amd_vrda_tan
amd_vrda_log
amd_vrda_log10
amd_vrda_log2
amd_vrda_sqrt
amd_vrda_pow
amd_vrda_add
\endcode</td></tr>
</table>
In the examples, v1 and v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD.
\section TopicUsingAOCL_Example Complete Example
\code
#define EIGEN_USE_AOCL_MT
#include <iostream>
#include <Eigen/Dense>
int main() {
const int n = 2048;
// Large matrices automatically use AOCL-BLIS for multiplication
Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd C = A * B; // Dispatched to dgemm
// Large vectors automatically use AOCL LibM for math functions
Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10);
Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin
// LAPACK decompositions use AOCL-FLAME
Eigen::LLT<Eigen::MatrixXd> llt(A); // Dispatched to dpotrf
std::cout << "Matrix norm: " << C.norm() << std::endl;
std::cout << "Vector result norm: " << result.norm() << std::endl;
return 0;
}
\endcode
\section TopicUsingAOCL_Building Building and Linking
To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries:
\code
export AOCL_ROOT=/path/to/aocl
clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \
-I./install/include -I${AOCL_ROOT}/include \
-Wno-parentheses my_app.cpp \
-L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \
-lpthread -lrt -lm -lomp \
-o eigen_aocl_example
\endcode
For multi-threaded performance, use the multi-threaded BLIS library:
\code
clang++ -O3 -g -DEIGEN_USE_AOCL_MT \
-I./install/include -I${AOCL_ROOT}/include \
-Wno-parentheses my_app.cpp \
-L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \
-lpthread -lrt -lm -lomp \
-o eigen_aocl_example
\endcode
Key compiler and linker flags:
- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML)
- \c -DEIGEN_USE_AOCL_MT: Enable multi-threaded version (uses \c -lblis-mt)
- \c -lblis: Single-threaded BLIS library
- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance)
- \c -lflame: FLAME LAPACK implementation
- \c -lamdlibm: AMD LibM vector math library
- \c -lomp: OpenMP runtime for multi-threading support
- \c -lpthread -lrt: System threading and real-time libraries
- \c -Wno-parentheses: Suppress common warnings when using AOCL headers
\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support
To build Eigen with AOCL Support, use the following CMake configuration:
\code
cmake .. -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_INSTALL_PREFIX=$PWD/install \
-DINCLUDE_INSTALL_DIR=$PWD/install/include \
&& make install -j$(nproc)
\endcode
To build Eigen with AOCL integration and benchmarking capabilities, use the following CMake configuration:
\code
cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \
-DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \
-DEIGEN_AOCL_BENCH_USE_MT=OFF \
-DEIGEN_AOCL_BENCH_ARCH=znver5 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_INSTALL_PREFIX=$PWD/install \
-DINCLUDE_INSTALL_DIR=$PWD/install/include \
&& make install -j$(nproc)
\endcode
**CMake Configuration Parameters:**
<table class="manual">
<tr><th>Parameter</th><th>Expected Values</th><th>Description</th></tr>
<tr><td>\c EIGEN_BUILD_AOCL_BENCH</td><td>\c ON, \c OFF</td><td>Enable/disable AOCL benchmark compilation</td></tr>
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_FLAGS</td><td>Compiler flags string</td><td>Additional compiler optimizations: \c "-O3 -mavx512f -fveclib=AMDLIBM"</td></tr>
<tr><td>\c EIGEN_AOCL_BENCH_USE_MT</td><td>\c ON, \c OFF</td><td>Use multi-threaded AOCL libraries (\c ON recommended for performance)</td></tr>
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_ARCH</td><td>\c znver3, \c znver4, \c znver5, \c native, \c generic</td><td>Target AMD architecture (match your CPU generation)</td></tr>
<tr><td>\c CMAKE_BUILD_TYPE</td><td>\c Release, \c Debug, \c RelWithDebInfo</td><td>Build configuration (\c Release recommended for benchmarks)</td></tr>
<tr class="alt"><td>\c CMAKE_C_COMPILER</td><td>\c clang, \c gcc</td><td>C compiler (clang recommended for AOCL)</td></tr>
<tr><td>\c CMAKE_CXX_COMPILER</td><td>\c clang++, \c g++</td><td>C++ compiler (clang++ recommended for AOCL)</td></tr>
<tr class="alt"><td>\c CMAKE_INSTALL_PREFIX</td><td>Installation path</td><td>Where to install Eigen headers</td></tr>
<tr><td>\c INCLUDE_INSTALL_DIR</td><td>Header path</td><td>Specific path for Eigen headers</td></tr>
</table>
**Architecture Selection Guide:**
- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series)
- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series)
- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series)
- \c native: Auto-detect current CPU architecture
- \c generic: Generic x86-64 without specific optimizations
**Custom Compiler Flags Explanation:**
- \c -O3: Maximum optimization level
- \c -mavx512f: Enable AVX-512 instruction set (if supported)
- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions
\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark
After configuring Eigen, build the AOCL benchmark executable:
\code
cmake --build . --target benchmark_aocl -j$(nproc)
\endcode
This creates the \c benchmark_aocl executable that demonstrates AOCL acceleration with various matrix sizes and operations.
**Running the Benchmark:**
\code
./benchmark_aocl
\endcode
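The benchmark prints wall-clock timings for the operations listed below. To compare %Eigen's native performance with AOCL-accelerated operations, build and run it both with and without the AOCL macros defined:
- Coefficient-wise vector math functions on large vectors (LibM when AOCL is enabled)
- Dense matrix multiplication, plus a direct multi-threaded BLIS \c dgemm call when multi-threaded BLIS is enabled
- Self-adjoint eigenvalue decomposition
- A covariance and eigenvalue-decomposition risk-computation scenario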
\section TopicUsingAOCL_CMake CMake Integration
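When using CMake, you can use the \c FindAOCL module from %Eigen's \c cmake directory. It sets \c AOCL_INCLUDE_DIRS and \c AOCL_LIBRARIES (configure with \c -DEIGEN_AOCL_BENCH_USE_MT=ON to make it prefer the multi-threaded BLIS library):
\code
find_package(AOCL REQUIRED)
target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT)
target_include_directories(my_target PRIVATE ${AOCL_INCLUDE_DIRS})
target_link_libraries(my_target PRIVATE ${AOCL_LIBRARIES})
\endcode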
\section TopicUsingAOCL_Troubleshooting Troubleshooting
Common issues and solutions:
- **Link errors**: Ensure \c AOCL_ROOT is set and libraries are in \c LD_LIBRARY_PATH
- **Performance not improved**: Verify you're using matrices/vectors larger than the threshold
- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count
- **Architecture mismatch**: Use appropriate \c -march flag for your AMD processor
\section TopicUsingAOCL_Links Links
- AMD AOCL can be downloaded for free <a href="https://www.amd.com/en/developer/aocl.html">here</a>
- AOCL User Guide and documentation available on the AMD Developer Portal
- AOCL is also available through package managers and containerized environments
*/
}