diff --git a/CMakeLists.txt b/CMakeLists.txt index 36912bcde..44cf5fe15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,7 @@ endif() option(EIGEN_BUILD_BTL "Build benchmark suite" OFF) option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF) +option(EIGEN_BUILD_AOCL_BENCH "Build AOCL benchmark" OFF) # Avoid building docs if included from another project. # Building documentation requires creating and running executables on the host # platform. We shouldn't do this if cross-compiling. @@ -305,17 +306,30 @@ if (EIGEN_IS_BUILDING_) set(CMAKE_INCLUDE_CURRENT_DIR OFF) find_package(StandardMathLibrary) + cmake_policy(SET CMP0074 NEW) + find_package(AOCL QUIET) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "") - if(NOT STANDARD_MATH_LIBRARY_FOUND) - message(FATAL_ERROR - "Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.") - else() - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}") - else() - set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") + if(AOCL_FOUND) + list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${AOCL_LIBRARIES}) + if(AOCL_INCLUDE_DIRS) + include_directories(${AOCL_INCLUDE_DIRS}) endif() endif() + + if(NOT STANDARD_MATH_LIBRARY_FOUND) + message(FATAL_ERROR + "Can't link to the standard math library. 
Please report to the Eigen developers, telling them about your platform.") + else() + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${STANDARD_MATH_LIBRARY}) # keep a ;-list: AOCL libraries were list(APPEND)ed, a space-joined string would yield one bogus "lib m" link item + else() + set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") + endif() + # Clean up any leading/trailing whitespace in the variable to avoid CMP0004 errors + string(STRIP "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}" EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + endif() + + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}") else() @@ -743,6 +757,57 @@ endif() if(NOT WIN32 AND EIGEN_BUILD_SPBENCH) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) endif() +#--------------------------------------------------------------------------------------# +# AOCL BENCHMARK BUILD SECTION # +#--------------------------------------------------------------------------------------# +if(EIGEN_BUILD_AOCL_BENCH) + # Allow users to override the default architecture + set(EIGEN_AOCL_BENCH_ARCH "znver5" CACHE STRING "Target architecture for AOCL benchmark") + add_executable(benchmark_aocl EXCLUDE_FROM_ALL bench/benchmark_aocl.cpp) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-march=${EIGEN_AOCL_BENCH_ARCH}" COMPILER_SUPPORTS_AOCL_ARCH) + if(COMPILER_SUPPORTS_AOCL_ARCH) + target_compile_options(benchmark_aocl PRIVATE -O3 -Wno-shadow -march=${EIGEN_AOCL_BENCH_ARCH}) + else() + message(WARNING "${EIGEN_AOCL_BENCH_ARCH} architecture not supported by compiler") + target_compile_options(benchmark_aocl PRIVATE -O3) + endif() + + # Add custom flags if provided + if(EIGEN_AOCL_BENCH_FLAGS) + separate_arguments(CUSTOM_FLAGS NATIVE_COMMAND "${EIGEN_AOCL_BENCH_FLAGS}") + target_compile_options(benchmark_aocl PRIVATE ${CUSTOM_FLAGS}) + # Check if OpenMP is requested in custom flags and link it + string(FIND "${EIGEN_AOCL_BENCH_FLAGS}" "-fopenmp" OPENMP_REQUESTED) +
if(NOT OPENMP_REQUESTED EQUAL -1) + find_package(OpenMP) + if(OpenMP_CXX_FOUND) + target_link_libraries(benchmark_aocl PRIVATE OpenMP::OpenMP_CXX) + else() + # Generic fallback: let compiler handle OpenMP linking + if(MSVC) + target_compile_options(benchmark_aocl PRIVATE "/openmp") + else() + target_compile_options(benchmark_aocl PRIVATE "-fopenmp") + target_link_options(benchmark_aocl PRIVATE "-fopenmp") + endif() + message(STATUS "Using compiler OpenMP flags as fallback") + endif() + endif() + endif() + + target_include_directories(benchmark_aocl PRIVATE ${INCLUDE_INSTALL_DIR}) + if(NOT DEFINED EIGEN_AOCL_BENCH_USE_MT OR EIGEN_AOCL_BENCH_USE_MT) # multithreaded AOCL is the documented default (bench/README.txt); set EIGEN_AOCL_BENCH_USE_MT=OFF for single-threaded + target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_MT) + else() + target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_ALL) + endif() + target_link_libraries(benchmark_aocl PRIVATE Eigen3::Eigen) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(benchmark_aocl PRIVATE ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() +endif() +#----------------------------------------------------------------------------------------# if (EIGEN_BUILD_DEMOS) add_subdirectory(demos EXCLUDE_FROM_ALL) @@ -792,6 +857,9 @@ if(PROJECT_IS_TOP_LEVEL) if (EIGEN_BUILD_LAPACK) message(STATUS "lapack | Build LAPACK subset library (not the same thing as Eigen)") endif() + if(EIGEN_BUILD_AOCL_BENCH) + message(STATUS "benchmark_aocl | Build AOCL benchmark executable") + endif() message(STATUS "------------+--------------------------------------------------------------") message(STATUS "") endif() @@ -799,3 +867,4 @@ endif() message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}") message(STATUS "") + diff --git a/Eigen/Core b/Eigen/Core index 94fd6ecc0..9f81658b0 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -53,6 +53,8 @@ // this include file manages BLAS and MKL related macros // and inclusion of their respective header files #include "src/Core/util/MKL_support.h" +#include "src/Core/util/AOCL_Support.h" + #if defined(EIGEN_HAS_CUDA_FP16)
|| defined(EIGEN_HAS_HIP_FP16) #define EIGEN_HAS_GPU_FP16 @@ -463,6 +465,10 @@ using std::ptrdiff_t; #include "src/Core/Assign_MKL.h" #endif +#ifdef EIGEN_USE_AOCL_VML +#include "src/Core/Assign_AOCL.h" +#endif + #include "src/Core/GlobalFunctions.h" // IWYU pragma: end_exports diff --git a/Eigen/src/Core/Assign_AOCL.h b/Eigen/src/Core/Assign_AOCL.h new file mode 100644 index 000000000..da3ef7cea --- /dev/null +++ b/Eigen/src/Core/Assign_AOCL.h @@ -0,0 +1,301 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + * + * Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen + * + * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + * + * Description: + * ------------ + * This file implements a high-performance dispatch layer that automatically + * routes Eigen's element-wise mathematical operations to AMD Optimizing CPU + * Libraries (AOCL) Vector Math Library (VML) functions when beneficial for + * performance. + * + * The dispatch system uses C++ template specialization to intercept Eigen's + * assignment operations and redirect them to AOCL's VRDA functions, which + * provide optimized implementations for AMD Zen architectures. + * + * Key Features: + * ------------- + * 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without + * requiring code changes in user applications + * + * 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen + * family processors with automatic SIMD instruction selection (AVX2, AVX-512) + * + * 3. Threshold-Based Activation: Only activates for vectors larger than + * EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on + * small vectors + * + * 4. 
Precision-Specific Handling: + * - Double precision: AOCL VRDA vectorized functions + * - Single precision: no AOCL dispatch (the float instantiations below are disabled) + * + * 5. Memory Layout Compatibility: Ensures direct memory access and compatible + * storage orders between source and destination for optimal performance + * + * Supported Operations: + * --------------------- + * UNARY OPERATIONS (vector → vector): + * - Math functions: exp(), exp2(), sin(), cos(), sqrt(), cbrt(), abs(), log(), log10(), log2() + * + * BINARY OPERATIONS (vector op vector → vector): + * - Arithmetic: +, pow(), max(), min() + * + * Template Specialization Mechanism: + * ----------------------------------- + * The system works by specializing Eigen's Assignment template for: + * 1. CwiseUnaryOp with scalar_*_op functors (unary operations) + * 2. CwiseBinaryOp with scalar_*_op functors (binary operations) + * 3. Dense2Dense assignment context with AOCL-compatible traits + * + * Dispatch conditions (all must be true): + * - Source and destination have DirectAccessBit (raw data pointer available; contiguity is assumed, not checked) + * - Compatible storage orders (both row-major or both column-major) + * - Compile-time inner size ≥ EIGEN_AOCL_VML_THRESHOLD, or Dynamic size (no runtime size check) + * - Supported data type (currently double precision for VRDA) + * + * Integration Example: + * -------------------- + * // Standard Eigen code - no changes required + * VectorXd x = VectorXd::Random(10000); + * VectorXd y = VectorXd::Random(10000); + * VectorXd result; + * + * // These operations are automatically dispatched to AOCL: + * result = x.array().exp(); // → amd_vrda_exp() + * result = x.array().sin(); // → amd_vrda_sin() + * result = x.array() + y.array(); // → amd_vrda_add() + * result = x.array().pow(y.array()); // → amd_vrda_pow() + * + * Configuration: + * -------------- + * Required preprocessor definitions: + * - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration + * - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch (defined automatically by AOCL_Support.h) + * + * Compilation Requirements: + * ------------------------- + *
Include paths: + * - AOCL headers: -I${AOCL_ROOT}/include + * - Eigen headers: -I/path/to/eigen + * + * Link libraries: + * - AOCL MathLib: -lamdlibm + * - Standard math: -lm + * + * Compiler flags: + * - Optimization: -O3 (required for inlining) + * - Architecture: -march=znver5 or -march=native + * - Vectorization: -mfma -mavx512f (if supported) + * + * Platform Support: + * ------------------ + * - Primary: Linux x86_64 with AMD Zen family processors + * - Compilers: GCC 8+, Clang 10+, AOCC (recommended) + * - AOCL Version: 4.0+ (with VRDA support) + * + * Error Handling: + * --------------- + * - Graceful fallback to scalar operations for unsupported configurations + * - Compile-time detection of AOCL availability + * - Runtime size and alignment validation with eigen_assert() + * + * Developer: + * ---------- + * Name: Sharad Saurabh Bhaskar + * Email: shbhaska@amd.com + * Organization: Advanced Micro Devices, Inc. + */ + + +#ifndef EIGEN_ASSIGN_AOCL_H +#define EIGEN_ASSIGN_AOCL_H + +namespace Eigen { +namespace internal { + +// Traits for unary operations. +template class aocl_assign_traits { +private: + enum { + DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit), + SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit), + StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)), + InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime) + : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) + : int(Dst::RowsAtCompileTime), + LargeEnough = + (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD) + }; + +public: + enum { + EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess && + StorageOrdersAgree && LargeEnough, + Traversal = LinearTraversal + }; +}; + +// Traits for binary operations (e.g., add, pow). 
+template +class aocl_assign_binary_traits { +private: + enum { + DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit), + LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit), + RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit), + StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) && + (int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)), + InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime) + : (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime) + : int(Dst::RowsAtCompileTime), + LargeEnough = + (InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD) + }; + +public: + enum { + EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess && + RhsHasDirectAccess && StorageOrdersAgree && LargeEnough + }; +}; + +// Unary operation dispatch for float (scalar fallback). +#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \ + template \ + struct Assignment< \ + DstXprType, CwiseUnaryOp, SrcXprNested>, \ + assign_op, Dense2Dense, \ + std::enable_if_t< \ + aocl_assign_traits::EnableAoclVML>> { \ + typedef CwiseUnaryOp, SrcXprNested> \ + SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, \ + const assign_op &) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + Eigen::Index n = dst.size(); \ + if (n <= 0) \ + return; \ + const float *input = \ + reinterpret_cast(src.nestedExpression().data()); \ + float *output = reinterpret_cast(dst.data()); \ + for (Eigen::Index i = 0; i < n; ++i) { \ + output[i] = std::EIGENOP(input[i]); \ + } \ + } \ + }; + +// Unary operation dispatch for double (AOCL vectorized). 
+#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \ + template \ + struct Assignment< \ + DstXprType, CwiseUnaryOp, SrcXprNested>, \ + assign_op, Dense2Dense, \ + std::enable_if_t< \ + aocl_assign_traits::EnableAoclVML>> { \ + typedef CwiseUnaryOp, SrcXprNested> \ + SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, \ + const assign_op &) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + Eigen::Index n = dst.size(); \ + eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \ + if (n <= 0) \ + return; \ + const double *input = \ + reinterpret_cast(src.nestedExpression().data()); \ + double *output = reinterpret_cast(dst.data()); \ + int aocl_n = internal::convert_index(n); \ + AOCLOP(aocl_n, const_cast(input), output); \ + } \ + }; + +// Instantiate unary calls for float (scalar). +// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp) + +// Instantiate unary calls for double (AOCL vectorized). +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10) +EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2) + +// Binary operation dispatch for float (scalar fallback). 
+#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \ + template \ + struct Assignment< \ + DstXprType, \ + CwiseBinaryOp, LhsXprNested, \ + RhsXprNested>, \ + assign_op, Dense2Dense, \ + std::enable_if_t::EnableAoclVML>> { \ + typedef CwiseBinaryOp, LhsXprNested, \ + RhsXprNested> \ + SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, \ + const assign_op &) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + Eigen::Index n = dst.size(); \ + if (n <= 0) \ + return; \ + const float *lhs = reinterpret_cast(src.lhs().data()); \ + const float *rhs = reinterpret_cast(src.rhs().data()); \ + float *output = reinterpret_cast(dst.data()); \ + for (Eigen::Index i = 0; i < n; ++i) { \ + output[i] = STDFUNC(lhs[i], rhs[i]); \ + } \ + } \ + }; + +// Binary operation dispatch for double (AOCL vectorized). +#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \ + template \ + struct Assignment< \ + DstXprType, \ + CwiseBinaryOp, LhsXprNested, \ + RhsXprNested>, \ + assign_op, Dense2Dense, \ + std::enable_if_t::EnableAoclVML>> { \ + typedef CwiseBinaryOp, LhsXprNested, \ + RhsXprNested> \ + SrcXprType; \ + static void run(DstXprType &dst, const SrcXprType &src, \ + const assign_op &) { \ + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ + Eigen::Index n = dst.size(); \ + eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \ + if (n <= 0) \ + return; \ + const double *lhs = reinterpret_cast(src.lhs().data()); \ + const double *rhs = reinterpret_cast(src.rhs().data()); \ + double *output = reinterpret_cast(dst.data()); \ + int aocl_n = internal::convert_index(n); \ + AOCLOP(aocl_n, const_cast(lhs), const_cast(rhs), output); \ + } \ + }; + +// Instantiate binary calls for float (scalar). 
+// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus) // Using +// scalar_sum_op for addition EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow) + +// Instantiate binary calls for double (AOCL vectorized). +EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition +EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow) +EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax) +EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ASSIGN_AOCL_H diff --git a/Eigen/src/Core/util/AOCL_Support.h b/Eigen/src/Core/util/AOCL_Support.h new file mode 100644 index 000000000..434ccfd6c --- /dev/null +++ b/Eigen/src/Core/util/AOCL_Support.h @@ -0,0 +1,175 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + * + * AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen + * + * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + * + * Description: + * ------------ + * This header file serves as the central configuration and integration point + * for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library. + * It orchestrates the integration of multiple AOCL components to provide + * optimal mathematical computing performance on AMD Zen family processors. + * + * AOCL Component Integration: + * --------------------------- + * 1. AOCL Vector Math Library (VML): + * - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions + * - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc. + * - SIMD vectorization for AMD architectures (AVX2, AVX-512) + * - Headers: amdlibm.h, amdlibm_vec.h + * + * 2. 
AOCL BLAS (BLIS - BLAS-like Library Instantiation Software): + * - High-performance Basic Linear Algebra Subprograms + * - Supports single-threaded (libblis) and multithreaded (libblis-mt) + * variants + * - Optimized matrix operations: GEMM, GEMV, TRSM, etc. + * - Headers: cblas.h, blis.h + * + * 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment): + * - Dense linear algebra operations: factorizations, eigenvalue solvers + * - Matrix decompositions: LU, Cholesky, QR, SVD + * - Eigenvalue/eigenvector computations optimized for AMD hardware + * - Headers: LAPACKE interface + * + * ------------------------------ + * EIGEN_AOCL_VML_THRESHOLD (default: 128): + * - Minimum compile-time vector size for AOCL VML dispatch + * - Smaller fixed-size vectors use standard Eigen to avoid function call overhead; dynamic sizes always dispatch + * - Optimal values: 64-512 depending on operation and data characteristics + * + * + * + * Architecture Support: + * --------------------- + * Optimized for AMD processor families: + * - Zen Architecture (Naples, Summit Ridge): AVX2 optimization + * - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2 + * - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache + * - Zen 4 Architecture (Genoa, Raphael): AVX-512 support + * - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512 + * + * + * Dependencies: + * ------------- + * Required AOCL components: + * - libamdlibm: Core math library with VRDA functions + * - libblis or libblis-mt: BLAS implementation + * - libflame: LAPACK implementation + * + * System requirements: + * - AMD x86_64 processor (optimal performance) + * - Linux, Windows, or compatible POSIX system + * - C++14 or later standard (the VML dispatch layer uses std::enable_if_t) + * - CMake 3.5+ for build system integration + * + * Developer: + * ---------- + * Name: Sharad Saurabh Bhaskar + * Email: shbhaska@amd.com + * Organization: Advanced Micro Devices, Inc.
+ */ + +#ifndef EIGEN_AOCL_SUPPORT_H +#define EIGEN_AOCL_SUPPORT_H + +#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT) + +#include + +// Define AOCL component flags based on main flags +#ifdef EIGEN_USE_AOCL_ALL +#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library +#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS) + +// Enable Eigen BLAS backend only if BLIS provides compatible interface +#if defined(EIGEN_AOCL_BLIS_COMPATIBLE) +#define EIGEN_USE_BLAS // Enable Eigen BLAS backend +#endif + +#define EIGEN_USE_LAPACKE // Enable LAPACK backend (FLAME) +#endif + +#ifdef EIGEN_USE_AOCL_MT +#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library +#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS) + +// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts +// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS +// #define EIGEN_USE_BLAS // Commented out - causes conflicts with BLIS +// interface + +// NOTE(review): this was annotated as "LAPACKE disabled in MT mode", yet the define below is active - confirm which is intended +#define EIGEN_USE_LAPACKE // LAPACK backend is currently ENABLED in MT mode +#define EIGEN_AOCL_USE_BLIS_MT 1 // Enable multithreaded BLIS +#endif + +// Safety net only: both branches above already define EIGEN_USE_AOCL_VML, so this block is a no-op here +#ifndef EIGEN_USE_AOCL_VML +#ifdef EIGEN_USE_AOCL_ALL +#define EIGEN_USE_AOCL_VML +#endif +#ifdef EIGEN_USE_AOCL_MT +#define EIGEN_USE_AOCL_VML +#endif +#endif + +// Configuration constants - define these for any AOCL usage +#ifndef EIGEN_AOCL_VML_THRESHOLD +#define EIGEN_AOCL_VML_THRESHOLD 128 // Threshold for VML dispatch +#endif + +#ifndef AOCL_SIMD_WIDTH +#define AOCL_SIMD_WIDTH 8 // AVX-512: 512 bits / 64 bits per double +#endif + +// Include AOCL Math Library headers for VML +#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || \ + defined(EIGEN_USE_AOCL_MT) +#if defined(__has_include) +#if __has_include("amdlibm.h") +#include "amdlibm.h" +#ifndef AMD_LIBM_VEC_EXPERIMENTAL +#define AMD_LIBM_VEC_EXPERIMENTAL +#endif +#if
__has_include("amdlibm_vec.h") +#include "amdlibm_vec.h" +#endif +#endif +#else +// Fallback for compilers without __has_include +#include "amdlibm.h" +#ifndef AMD_LIBM_VEC_EXPERIMENTAL +#define AMD_LIBM_VEC_EXPERIMENTAL +#endif +#include "amdlibm_vec.h" +#endif +#endif + +// Include CBLAS headers when BLAS is enabled +#ifdef EIGEN_USE_AOCL_BLAS +#if defined(__has_include) +#if __has_include("cblas.h") +#include "cblas.h" +#elif __has_include("blis.h") +#include "blis.h" +#endif +#else +// Fallback +#include "cblas.h" +#endif +#endif + +namespace Eigen { +// AOCL-specific type definitions +typedef std::complex dcomplex; +typedef std::complex scomplex; +typedef int BlasIndex; // Standard BLAS index type +} // namespace Eigen + +#endif // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT + +#endif // EIGEN_AOCL_SUPPORT_H diff --git a/bench/README.txt b/bench/README.txt index 39831ae8a..49c278f58 100644 --- a/bench/README.txt +++ b/bench/README.txt @@ -53,3 +53,56 @@ $ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp +************************ +* benchmark_aocl * +************************ + +This benchmark exercises Eigen operations using AMD Optimized Libraries +(AOCL). It is disabled by default and can be enabled when configuring the +build: + + cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON + +The resulting `benchmark_aocl` target is compiled with `-O3` and, if the +compiler supports it, `-march=znver5` for optimal performance on AMD +processors. + +The benchmark also links to `libblis-mt.so` and `libflame.so` so BLAS and +LAPACK operations run with multithreaded AOCL when available. + +By default the CMake build defines `EIGEN_USE_AOCL_MT` via the option +`EIGEN_AOCL_BENCH_USE_MT` (enabled). Set this option to `OFF` if you want +to build the benchmark using the single-threaded AOCL libraries instead, +in which case `EIGEN_USE_AOCL_ALL` is defined. + + + +Alternatively you can build the same benchmark using the +`Makefile` in this directory. 
This allows experimenting with +different compiler flags without reconfiguring CMake: + +``` +cd bench && make # builds with -O3 -march=znver5 by default +make clean && make CXX="clang++" # use a different compiler than g++ +make clean && make MARCH="" CXXFLAGS="-O2" # example of custom flags +make AOCL_ROOT=/opt/aocl # use AOCL from a custom location +``` +This Makefile links against `libblis-mt.so` and `libflame.so` so the +matrix multiplication benchmark exercises multithreaded BLIS when +`EIGEN_USE_AOCL_MT` is defined (enabled by default in the Makefile). + +If you prefer to compile manually, ensure that the Eigen include path +points to the directory where `AOCL_Support.h` resides. For example: + +``` +clang++ -O3 -std=c++14 -I../build/install/include \ + -march=znver5 -DEIGEN_USE_AOCL_MT \ + benchmark_aocl.cpp -o benchmark_aocl \ + -lblis-mt -lflame -lamdlibm -lpthread -lm +``` +Replace `../build/install/include` with your actual Eigen install path. + +When invoking `make`, you can point `AOCL_ROOT` to your AOCL +installation directory so the Makefile links against `$(AOCL_ROOT)/lib`. + + diff --git a/bench/benchmark_aocl.cpp b/bench/benchmark_aocl.cpp new file mode 100644 index 000000000..33d7af217 --- /dev/null +++ b/bench/benchmark_aocl.cpp @@ -0,0 +1,362 @@ +/* + * benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen + * + * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Description: + * ------------ + * This benchmark suite evaluates the performance of Eigen mathematical + * operations when integrated with AMD Optimizing CPU Libraries (AOCL). It + * tests: + * + * 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt, + * log, etc.)
using AOCL Vector Math Library (VML) for optimized + * double-precision operations + * + * 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library + * with support for both single-threaded and multithreaded execution + * + * 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using + * libflame + * + * 4. Real-world Scenarios: Financial risk computation simulating covariance + * matrix calculations and eigenvalue analysis for portfolio optimization + * + * The benchmark automatically detects AOCL configuration and adjusts test + * execution accordingly, providing performance comparisons between standard + * Eigen operations and AOCL-accelerated implementations. + * + * Compilation: + * ------------ + * # Using AOCC compiler (recommended for best AOCL compatibility): + * clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I + * -I${AOCL_ROOT}/include \ + * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ + * -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \ + * -o build/eigen_aocl_benchmark + * + * # Alternative: Using GCC with proper library paths: + * g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I + * -I${AOCL_ROOT}/include \ + * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ + * -lamdlibm -lm -lblis -lflame -lpthread -lrt \ + * -o build/eigen_aocl_benchmark + * + * # For multithreaded BLIS support: + * clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I \ + * -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \ + * -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \ + * -o build/eigen_aocl_benchmark_mt + * + * Usage: + * ------ + * export AOCL_ROOT=/path/to/aocl/installation + * export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH + * ./build/eigen_aocl_benchmark + * + * Developer: + * ---------- + * Name: Sharad Saurabh Bhaskar + * Email: shbhaska@amd.com + * Organization: Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include + +// Simple - just include Eigen headers +#include +#include +#include + +// Only include CBLAS if AOCL BLIS is available +#ifdef EIGEN_USE_AOCL_ALL +#include +#endif + +using namespace std; +using namespace std::chrono; +using namespace Eigen; + +void benchmarkVectorMath(int size) { + VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0); + VectorXd result(size); + double elapsed_ms = 0; + + cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl; + + auto start = high_resolution_clock::now(); + result = v.array().exp(); + auto end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "exp() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().sin(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "sin() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().cos(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "cos() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().sqrt(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "sqrt() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().cbrt(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "cbrt() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().abs(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "abs() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().log(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + 
cout << "log() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().log10(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "log10() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().exp2(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "exp2() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().asin(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "asin() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().sinh(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "sinh() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().acos(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "acos() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().cosh(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "cosh() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().tan(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "tan() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().atan(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "atan() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().tanh(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "tanh() time: " << elapsed_ms 
<< " ms" << endl; + + VectorXd v2 = VectorXd::Random(size); + start = high_resolution_clock::now(); + result = v.array() + v2.array(); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "add() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().pow(2.0); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "pow() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().max(v2.array()); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "max() time: " << elapsed_ms << " ms" << endl; + + start = high_resolution_clock::now(); + result = v.array().min(v2.array()); + end = high_resolution_clock::now(); + elapsed_ms = duration_cast(end - start).count(); + cout << "min() time: " << elapsed_ms << " ms" << endl; +} + +// Function to benchmark BLAS operation: Matrix multiplication. +void benchmarkMatrixMultiplication(int matSize) { + cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize + << ") ---" << endl; + + MatrixXd A = MatrixXd::Random(matSize, matSize); + MatrixXd B = MatrixXd::Random(matSize, matSize); + MatrixXd C(matSize, matSize); + + auto start = high_resolution_clock::now(); + C = A * B; + auto end = high_resolution_clock::now(); + double elapsed_ms = duration_cast(end - start).count(); + cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl; +} + +// Benchmark BLIS directly using its CBLAS interface if available. 
+void benchmarkBlisMultithreaded(int matSize, int numThreads) { +#if defined(EIGEN_AOCL_USE_BLIS_MT) + cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize + << ", threads=" << numThreads << ") ---" << endl; + vector A(matSize * matSize); + vector B(matSize * matSize); + vector C(matSize * matSize); + for (auto &v : A) + v = static_cast(rand()) / RAND_MAX; + for (auto &v : B) + v = static_cast(rand()) / RAND_MAX; + double alpha = 1.0, beta = 0.0; + string th = to_string(numThreads); + setenv("BLIS_NUM_THREADS", th.c_str(), 1); + auto start = high_resolution_clock::now(); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize, + matSize, alpha, A.data(), matSize, B.data(), matSize, beta, + C.data(), matSize); + auto end = high_resolution_clock::now(); + double elapsed_ms = duration_cast(end - start).count(); + cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl; +#else + (void)matSize; + (void)numThreads; + cout << "\nBLIS multithreaded support not enabled." << endl; +#endif +} + +// Function to benchmark LAPACK operation: Eigenvalue decomposition. +void benchmarkEigenDecomposition(int matSize) { + cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize + << " x " << matSize << ") ---" << endl; + MatrixXd M = MatrixXd::Random(matSize, matSize); + // Make matrix symmetric (necessary for eigenvalue decomposition of + // self-adjoint matrices) + M = (M + M.transpose()) * 0.5; + + SelfAdjointEigenSolver eigensolver; + auto start = high_resolution_clock::now(); + eigensolver.compute(M); + auto end = high_resolution_clock::now(); + double elapsed_ms = duration_cast(end - start).count(); + if (eigensolver.info() == Success) { + cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl; + } else { + cout << "Eigenvalue decomposition failed." << endl; + } +} + +// Function simulating a real-world FSI risk computation scenario. 
+// Example: Compute covariance matrix from simulated asset returns, then perform +// eigenvalue decomposition. +void benchmarkFSIRiskComputation(int numPeriods, int numAssets) { + cout << "\n--- FSI Risk Computation Benchmark ---" << endl; + cout << "Simulating " << numPeriods << " periods for " << numAssets + << " assets." << endl; + + // Simulate asset returns: each column represents an asset's returns. + MatrixXd returns = MatrixXd::Random(numPeriods, numAssets); + + // Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1) + auto start = high_resolution_clock::now(); + MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1); + auto end = high_resolution_clock::now(); + double cov_time = duration_cast(end - start).count(); + cout << "Covariance matrix computation time: " << cov_time << " ms" << endl; + + // Eigenvalue decomposition on covariance matrix. + SelfAdjointEigenSolver eigensolver; + start = high_resolution_clock::now(); + eigensolver.compute(cov); + end = high_resolution_clock::now(); + double eig_time = duration_cast(end - start).count(); + if (eigensolver.info() == Success) { + cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms" + << endl; + cout << "Top 3 Eigenvalues: " + << eigensolver.eigenvalues().tail(3).transpose() << endl; + } else { + cout << "Eigenvalue decomposition failed." << endl; + } +} + +int main() { + cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl; + cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl; + cout << "Organization: Advanced Micro Devices, Inc." 
<< endl; + cout << "License: Mozilla Public License 2.0" << endl << endl; + + // Print AOCL configuration +#ifdef EIGEN_USE_AOCL_MT + cout << "AOCL Mode: MULTITHREADED (MT)" << endl; + cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl; +#elif defined(EIGEN_USE_AOCL_ALL) + cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl; + cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl; +#else + cout << "AOCL Mode: DISABLED" << endl; + cout << "Using standard Eigen implementation" << endl; +#endif + cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl; + + // Benchmark vector math functions with varying vector sizes. + vector vectorSizes = {5000000, 10000000, 50000000}; + for (int size : vectorSizes) { + benchmarkVectorMath(size); + } + + // Benchmark matrix multiplication for varying sizes. + vector matrixSizes = {1024}; + for (int msize : matrixSizes) { + benchmarkMatrixMultiplication(msize); +#if defined(EIGEN_AOCL_USE_BLIS_MT) + benchmarkBlisMultithreaded(msize, thread::hardware_concurrency()); +#endif + } + + // Benchmark LAPACK: Eigenvalue Decomposition. + for (int msize : matrixSizes) { + benchmarkEigenDecomposition(msize); + } + + // Benchmark a complex FSI risk computation scenario. + // For example, simulate 10,000 time periods (days) for 500 assets. + benchmarkFSIRiskComputation(10000, 500); + + cout << "\n=== Benchmark Complete ===" << endl; + return 0; +} diff --git a/cmake/FindAOCL.cmake b/cmake/FindAOCL.cmake new file mode 100644 index 000000000..dd25ac951 --- /dev/null +++ b/cmake/FindAOCL.cmake @@ -0,0 +1,264 @@ + +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# +# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL) +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Description: +# ------------ +# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL) +# for high-performance mathematical computing on AMD processors. It searches for +# and sets up the following AOCL components: +# +# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized +# transcendental functions (exp, sin, cos, sqrt, log, etc.) with VRDA +# (Vector Rapid Double-precision Arithmetic) support for SIMD acceleration +# +# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD +# architectures, supporting both single-threaded (libblis) and multithreaded +# (libblis-mt) execution with OpenMP parallelization +# +# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix +# factorizations, eigenvalue/eigenvector computations, and linear system +# solvers optimized for AMD processors +# +# The module automatically detects the appropriate library variants based on +# configuration flags and provides proper linking setup for optimal performance +# on Zen, Zen2, Zen3, Zen4, and Zen5 architectures. +# +# Variables Set: +# -------------- +# AOCL_FOUND - True if AOCL libraries are found +# AOCL_LIBRARIES - List of AOCL libraries to link against +# AOCL_INCLUDE_DIRS - Include directories for AOCL headers +# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded") +# AOCL_CORE_LIB - Path to core AOCL math library +# AOCL_BLAS_LIB - Path to AOCL BLAS library +# AOCL_LAPACK_LIB - Path to AOCL LAPACK library +# +# Configuration Options: +# ---------------------- +# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first +# When OFF, searches for single-threaded BLIS only +# +# # For multithreaded BLIS: +# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON +# +# # For single-threaded BLIS: +# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF +# +# Library Search Paths: +# --------------------- +# The module searches for AOCL libraries in the following order: +# 1. 
${AOCL_ROOT}/lib, then ${AOCL_ROOT}/lib64 (or ${AOCL_ROOT}/lib32 for 32-bit)
+# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
+# 3. ${LIB_INSTALL_DIR}
+#
+# Expected Library Names:
+# -----------------------
+# Core MathLib: amdlibm, alm, almfast
+# BLAS Single: blis
+# BLAS Multi: blis-mt
+# LAPACK: flame
+#
+# Dependencies:
+# -------------
+# The module automatically links the following system libraries:
+# - libm (standard math library)
+# - libpthread (POSIX threads)
+# - librt (real-time extensions)
+#
+# Architecture Support:
+# ---------------------
+# Optimized for AMD Zen family processors (Zen, Zen2, Zen3, Zen4, Zen5)
+# with automatic architecture detection and SIMD instruction selection.
+#
+# Developer:
+# ----------
+# Name: Sharad Saurabh Bhaskar
+# Email: shbhaska@amd.com
+#
+
+# Resolve AOCL_ROOT: an explicit CMake variable wins, then the environment.
+# All diagnostics honour find_package(AOCL QUIET) via AOCL_FIND_QUIETLY.
+if(NOT DEFINED AOCL_ROOT)
+  if(DEFINED ENV{AOCL_ROOT})
+    set(AOCL_ROOT "$ENV{AOCL_ROOT}")
+    if(NOT AOCL_FIND_QUIETLY)
+      message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
+    endif()
+  else()
+    # The default install location is still searched below, so AOCL is not
+    # necessarily disabled when AOCL_ROOT is missing.
+    if(NOT AOCL_FIND_QUIETLY)
+      message(WARNING "AOCL_ROOT is not set. Only the default AOCL locations will be searched.")
+    endif()
+    set(AOCL_LIBRARIES "")
+  endif()
+endif()
+
+if(AOCL_LIBRARIES)
+  set(AOCL_FIND_QUIETLY TRUE)
+endif()
+
+# Determine default include directories
+set(AOCL_INCLUDE_DIRS "")
+if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
+  list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
+endif()
+if(EXISTS "/opt/amd/aocl/include")
+  list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
+endif()
+
+# Choose lib64 vs lib32 from the pointer size of the *target*, not from the
+# host CPU name: CMAKE_HOST_SYSTEM_PROCESSOR is "AMD64" on Windows, may be
+# empty, and says nothing about the target when cross-compiling.
+# Defaults to lib64 when the pointer size is unknown (no language enabled).
+if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+  set(_aocl_libdir "lib32")
+else()
+  set(_aocl_libdir "lib64")
+endif()
+
+# One shared list of extra search locations, in priority order.
+set(_aocl_search_paths "")
+if(AOCL_ROOT)
+  list(APPEND _aocl_search_paths "${AOCL_ROOT}/lib" "${AOCL_ROOT}/${_aocl_libdir}")
+endif()
+list(APPEND _aocl_search_paths "/opt/amd/aocl/${_aocl_libdir}")
+if(LIB_INSTALL_DIR)
+  list(APPEND _aocl_search_paths "${LIB_INSTALL_DIR}")
+endif()
+
+# Search for the core AOCL math library.
+find_library(AOCL_CORE_LIB
+  NAMES amdlibm alm almfast
+  PATHS ${_aocl_search_paths}
+)
+if(NOT AOCL_FIND_QUIETLY)
+  if(AOCL_CORE_LIB)
+    message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
+  else()
+    message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
+  endif()
+endif()
+
+# Search for BLIS. With EIGEN_AOCL_BENCH_USE_MT the multithreaded variant is
+# tried first and the single-threaded one is the fallback.
+if(EIGEN_AOCL_BENCH_USE_MT)
+  find_library(AOCL_BLAS_LIB
+    NAMES blis-mt
+    PATHS ${_aocl_search_paths}
+  )
+  if(NOT AOCL_BLAS_LIB AND NOT AOCL_FIND_QUIETLY)
+    message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
+  endif()
+endif()
+# No-op when AOCL_BLAS_LIB already holds a result (find_library does not
+# repeat a successful search).
+find_library(AOCL_BLAS_LIB
+  NAMES blis
+  PATHS ${_aocl_search_paths}
+)
+if(AOCL_BLAS_LIB)
+  # Derive the type from the file actually found so AOCL_BLAS_TYPE stays
+  # correct for cached results and is never set when nothing was found.
+  get_filename_component(_aocl_blas_file "${AOCL_BLAS_LIB}" NAME)
+  if(_aocl_blas_file MATCHES "blis-mt")
+    set(AOCL_BLAS_TYPE "multithreaded")
+    if(NOT AOCL_FIND_QUIETLY)
+      message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
+    endif()
+  else()
+    set(AOCL_BLAS_TYPE "single-threaded")
+    if(NOT AOCL_FIND_QUIETLY)
+      message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
+    endif()
+  endif()
+elseif(NOT AOCL_FIND_QUIETLY)
+  message(WARNING "AOCL single-threaded BLAS library not found.")
+endif()
+
+# Now search for AOCL LAPACK library.
+find_library(AOCL_LAPACK_LIB
+  NAMES flame
+  PATHS ${_aocl_search_paths}
+)
+if(NOT AOCL_FIND_QUIETLY)
+  if(AOCL_LAPACK_LIB)
+    message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
+  else()
+    message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
+  endif()
+endif()
+
+# Do not leak helper variables into the including scope.
+unset(_aocl_libdir)
+unset(_aocl_search_paths)
+unset(_aocl_blas_file)
+
+# Combine the found libraries into one variable.
+if(AOCL_CORE_LIB) + set(AOCL_LIBRARIES ${AOCL_CORE_LIB}) +endif() +if(AOCL_BLAS_LIB) + list(APPEND AOCL_LIBRARIES ${AOCL_BLAS_LIB}) +endif() +if(AOCL_LAPACK_LIB) + list(APPEND AOCL_LIBRARIES ${AOCL_LAPACK_LIB}) +endif() +if(AOCL_LIBRARIES) + # Link against the standard math and pthread libraries as well as librt + list(APPEND AOCL_LIBRARIES m pthread rt) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(AOCL DEFAULT_MSG AOCL_LIBRARIES AOCL_INCLUDE_DIRS) +mark_as_advanced(AOCL_LIBRARIES AOCL_INCLUDE_DIRS) diff --git a/doc/UsingAOCL.dox b/doc/UsingAOCL.dox new file mode 100644 index 000000000..24ce69859 --- /dev/null +++ b/doc/UsingAOCL.dox @@ -0,0 +1,289 @@ +/* + Copyright (c) 2025, AMD Inc. All rights reserved. + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of AMD nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************** + * Content : Documentation on the use of AMD AOCL through Eigen + ******************************************************************************** +*/ + +namespace Eigen { + +/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen + +Since %Eigen version 3.4 and later, users can benefit from built-in AMD® Optimizing CPU Libraries (AOCL) optimizations with an installed copy of AOCL 5.0 (or later). + + AMD AOCL provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures. + +\note +AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use. + +Using AMD AOCL through %Eigen is straightforward: +-# export \c AOCL_ROOT into your environment +-# define one of the AOCL macros before including any %Eigen headers (see table below) +-# link your program to AOCL libraries (BLIS, FLAME, LibM) +-# ensure your system supports the target architecture optimizations + +When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines. 
+These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>. +Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms. + +The AOCL integration targets three core components: +- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies +- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality +- **LibM**: Optimized standard math routines with vectorized implementations + +\section TopicUsingAOCL_Macros Configuration Macros + +You can choose which parts will be substituted by defining one or multiple of the following macros: + + + + + + + + +
\c EIGEN_USE_BLAS Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)
\c EIGEN_USE_LAPACKE Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)
\c EIGEN_USE_LAPACKE_STRICT Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. \n This currently concerns only JacobiSVD which would be replaced by \c gesvd.
\c EIGEN_USE_AOCL_VML Enables the use of AOCL LibM vector math operations for coefficient-wise functions
\c EIGEN_USE_AOCL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML
\c EIGEN_USE_AOCL_MT Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.
+ +\note The AOCL integration automatically enables optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function call overhead. + +\section TopicUsingAOCL_Performance Performance Considerations + +The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations: + +- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library +- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5) +- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously +- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation + +\section TopicUsingAOCL_Types Supported Data Types and Sizes + +AOCL acceleration is applied to: +- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double> +- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD +- **Storage order**: Both column-major (default) and row-major layouts +- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures + +The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float. + +\section TopicUsingAOCL_Functions Vector Math Functions + +The following table summarizes coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML: + + + +
Code exampleAOCL routines
\code +v2 = v1.array().exp(); +v2 = v1.array().sin(); +v2 = v1.array().cos(); +v2 = v1.array().tan(); +v2 = v1.array().log(); +v2 = v1.array().log10(); +v2 = v1.array().log2(); +v2 = v1.array().sqrt(); +v2 = v1.array().pow(1.5); +v2 = v1.array() + v2.array(); +\endcode\code +amd_vrda_exp +amd_vrda_sin +amd_vrda_cos +amd_vrda_tan +amd_vrda_log +amd_vrda_log10 +amd_vrda_log2 +amd_vrda_sqrt +amd_vrda_pow +amd_vrda_add +\endcode
+ +In the examples, v1 and v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD. + +\section TopicUsingAOCL_Example Complete Example + +\code +#define EIGEN_USE_AOCL_MT +#include <Eigen/Dense> +#include <iostream> + +int main() { + const int n = 2048; + + // Large matrices automatically use AOCL-BLIS for multiplication + Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n); + Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n); + Eigen::MatrixXd C = A * B; // Dispatched to dgemm + + // Large vectors automatically use AOCL LibM for math functions + Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10); + Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin + + // LAPACK decompositions use AOCL-FLAME + Eigen::LLT<Eigen::MatrixXd> llt(A); // Dispatched to dpotrf + + std::cout << "Matrix norm: " << C.norm() << std::endl; + std::cout << "Vector result norm: " << result.norm() << std::endl; + + return 0; +} +\endcode + +\section TopicUsingAOCL_Building Building and Linking + +To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries: + +\code +export AOCL_ROOT=/path/to/aocl +clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \ + -I./install/include -I${AOCL_ROOT}/include \ + -Wno-parentheses my_app.cpp \ + -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \ + -lpthread -lrt -lm -lomp \ + -o eigen_aocl_example +\endcode + +For multi-threaded performance, use the multi-threaded BLIS library: +\code +clang++ -O3 -g -DEIGEN_USE_AOCL_MT \ + -I./install/include -I${AOCL_ROOT}/include \ + -Wno-parentheses my_app.cpp \ + -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \ + -lpthread -lrt -lm -lomp \ + -o eigen_aocl_example +\endcode + +Key compiler and linker flags: +- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML) +- \c -DEIGEN_USE_AOCL_MT: Enable multi-threaded version (uses \c -lblis-mt) +- \c -lblis: Single-threaded BLIS library +- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance) +- \c -lflame:
FLAME LAPACK implementation +- \c -lamdlibm: AMD LibM vector math library +- \c -lomp: OpenMP runtime for multi-threading support +- \c -lpthread -lrt: System threading and real-time libraries +- \c -Wno-parentheses: Suppress common warnings when using AOCL headers + +\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support + +To build Eigen with AOCL Support, use the following CMake configuration: + +\code +cmake .. -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_INSTALL_PREFIX=$PWD/install \ + -DINCLUDE_INSTALL_DIR=$PWD/install/include \ + && make install -j$(nproc) +\endcode + + +To build Eigen with AOCL integration and benchmarking capabilities, use the following CMake configuration: + +\code +cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \ + -DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \ + -DEIGEN_AOCL_BENCH_USE_MT=OFF \ + -DEIGEN_AOCL_BENCH_ARCH=znver5 \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_INSTALL_PREFIX=$PWD/install \ + -DINCLUDE_INSTALL_DIR=$PWD/install/include \ + && make install -j$(nproc) +\endcode + +**CMake Configuration Parameters:** + + + + + + + + + + + + +
ParameterExpected ValuesDescription
\c EIGEN_BUILD_AOCL_BENCH\c ON, \c OFFEnable/disable AOCL benchmark compilation
\c EIGEN_AOCL_BENCH_FLAGSCompiler flags stringAdditional compiler optimizations: \c "-O3 -mavx512f -fveclib=AMDLIBM"
\c EIGEN_AOCL_BENCH_USE_MT\c ON, \c OFFUse multi-threaded AOCL libraries (\c ON recommended for performance)
\c EIGEN_AOCL_BENCH_ARCH\c znver3, \c znver4, \c znver5, \c native, \c genericTarget AMD architecture (match your CPU generation)
\c CMAKE_BUILD_TYPE\c Release, \c Debug, \c RelWithDebInfoBuild configuration (\c Release recommended for benchmarks)
\c CMAKE_C_COMPILER\c clang, \c gccC compiler (clang recommended for AOCL)
\c CMAKE_CXX_COMPILER\c clang++, \c g++C++ compiler (clang++ recommended for AOCL)
\c CMAKE_INSTALL_PREFIXInstallation pathWhere to install Eigen headers
\c INCLUDE_INSTALL_DIRHeader pathSpecific path for Eigen headers
+ +**Architecture Selection Guide:** +- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series) +- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series) +- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series) +- \c native: Auto-detect current CPU architecture +- \c generic: Generic x86-64 without specific optimizations + +**Custom Compiler Flags Explanation:** +- \c -O3: Maximum optimization level +- \c -mavx512f: Enable AVX-512 instruction set (if supported) +- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions + +\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark + +After configuring Eigen, build the AOCL benchmark executable: + +\code +cmake --build . --target benchmark_aocl -j$(nproc) +\endcode + +This creates the \c benchmark_aocl executable that demonstrates AOCL acceleration with various matrix sizes and operations. + +**Running the Benchmark:** +\code +./benchmark_aocl +\endcode + +The benchmark will automatically compare: +- Eigen's native performance vs AOCL-accelerated operations +- Matrix multiplication performance (BLIS vs Eigen) +- Vector math functions performance (LibM vs Eigen) +- Memory bandwidth utilization and cache efficiency + +\section TopicUsingAOCL_CMake CMake Integration + +When using CMake, you can use the FindAOCL module shipped in Eigen's \c cmake directory (add that directory to \c CMAKE_MODULE_PATH). The module does not define imported targets; it sets the variables \c AOCL_LIBRARIES and \c AOCL_INCLUDE_DIRS: + +\code +# FindAOCL searches for libblis-mt only when this variable is ON +set(EIGEN_AOCL_BENCH_USE_MT ON) +find_package(AOCL REQUIRED) +target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT) +target_include_directories(my_target PRIVATE ${AOCL_INCLUDE_DIRS}) +target_link_libraries(my_target PRIVATE ${AOCL_LIBRARIES}) +\endcode + +\section TopicUsingAOCL_Troubleshooting Troubleshooting + +Common issues and solutions: + +- **Link errors**: Ensure \c AOCL_ROOT is set and libraries are in \c LD_LIBRARY_PATH +- **Performance not improved**: Verify you're using matrices/vectors larger than the threshold +- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count +- **Architecture mismatch**: Use appropriate \c -march flag for your AMD processor + +\section TopicUsingAOCL_Links Links + +- AMD AOCL can be downloaded for free
here +- AOCL User Guide and documentation available on the AMD Developer Portal +- AOCL is also available through package managers and containerized environments + +*/ + +}