mirror of https://gitlab.com/libeigen/eigen.git
committed by Rasmus Munk Larsen
parent a6630c53c1
commit 8a1083e9bf
@@ -71,6 +71,7 @@ endif()
option(EIGEN_BUILD_BTL "Build benchmark suite" OFF)
option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF)
option(EIGEN_BUILD_AOCL_BENCH "Build AOCL benchmark" OFF)
# Avoid building docs if included from another project.
# Building documentation requires creating and running executables on the host
# platform. We shouldn't do this if cross-compiling.
@@ -305,17 +306,30 @@ if (EIGEN_IS_BUILDING_)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR OFF)
|
||||
|
||||
find_package(StandardMathLibrary)
|
||||
cmake_policy(SET CMP0074 NEW)
|
||||
find_package(AOCL QUIET)
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "")
|
||||
if(NOT STANDARD_MATH_LIBRARY_FOUND)
|
||||
message(FATAL_ERROR
|
||||
"Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
|
||||
else()
|
||||
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
|
||||
else()
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
|
||||
if(AOCL_FOUND)
|
||||
list(APPEND EIGEN_STANDARD_LIBRARIES_TO_LINK_TO ${AOCL_LIBRARIES})
|
||||
if(AOCL_INCLUDE_DIRS)
|
||||
include_directories(${AOCL_INCLUDE_DIRS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT STANDARD_MATH_LIBRARY_FOUND)
|
||||
message(FATAL_ERROR
|
||||
"Can't link to the standard math library. Please report to the Eigen developers, telling them about your platform.")
|
||||
else()
|
||||
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}")
|
||||
else()
|
||||
set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}")
|
||||
endif()
|
||||
# Clean up any leading/trailing whitespace in the variable to avoid CMP0004 errors
|
||||
string(STRIP "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}" EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
|
||||
endif()
|
||||
|
||||
|
||||
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
|
||||
message(STATUS "Standard libraries to link to explicitly: ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}")
|
||||
else()
|
||||
@@ -743,6 +757,57 @@ endif()
|
||||
if(NOT WIN32 AND EIGEN_BUILD_SPBENCH)
|
||||
add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
|
||||
endif()
|
||||
#--------------------------------------------------------------------------------------#
|
||||
# AOCL BENCHMARK BUILD SECTION #
|
||||
#--------------------------------------------------------------------------------------#
|
||||
if(EIGEN_BUILD_AOCL_BENCH)
|
||||
# Allow users to override the default architecture
|
||||
set(EIGEN_AOCL_BENCH_ARCH "znver5" CACHE STRING "Target architecture for AOCL benchmark")
|
||||
add_executable(benchmark_aocl EXCLUDE_FROM_ALL bench/benchmark_aocl.cpp)
|
||||
include(CheckCXXCompilerFlag)
|
||||
check_cxx_compiler_flag("-march=${EIGEN_AOCL_BENCH_ARCH}" COMPILER_SUPPORTS_AOCL_ARCH)
|
||||
if(COMPILER_SUPPORTS_AOCL_ARCH)
|
||||
target_compile_options(benchmark_aocl PRIVATE -O3 -Wno-shadow -march=${EIGEN_AOCL_BENCH_ARCH})
|
||||
else()
|
||||
message(WARNING "${EIGEN_AOCL_BENCH_ARCH} architecture not supported by compiler")
|
||||
target_compile_options(benchmark_aocl PRIVATE -O3)
|
||||
endif()
|
||||
|
||||
# Add custom flags if provided
|
||||
if(EIGEN_AOCL_BENCH_FLAGS)
|
||||
separate_arguments(CUSTOM_FLAGS NATIVE_COMMAND "${EIGEN_AOCL_BENCH_FLAGS}")
|
||||
target_compile_options(benchmark_aocl PRIVATE ${CUSTOM_FLAGS})
|
||||
# Check if OpenMP is requested in custom flags and link it
|
||||
string(FIND "${EIGEN_AOCL_BENCH_FLAGS}" "-fopenmp" OPENMP_REQUESTED)
|
||||
if(NOT OPENMP_REQUESTED EQUAL -1)
|
||||
find_package(OpenMP)
|
||||
if(OpenMP_CXX_FOUND)
|
||||
target_link_libraries(benchmark_aocl OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
# Generic fallback: let compiler handle OpenMP linking
|
||||
if(MSVC)
|
||||
target_compile_options(benchmark_aocl PRIVATE "/openmp")
|
||||
else()
|
||||
target_compile_options(benchmark_aocl PRIVATE "-fopenmp")
|
||||
target_link_options(benchmark_aocl PRIVATE "-fopenmp")
|
||||
endif()
|
||||
message(STATUS "Using compiler OpenMP flags as fallback")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
target_include_directories(benchmark_aocl PRIVATE ${INCLUDE_INSTALL_DIR})
|
||||
if(EIGEN_AOCL_BENCH_USE_MT)
|
||||
target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_MT)
|
||||
else()
|
||||
target_compile_definitions(benchmark_aocl PRIVATE EIGEN_USE_AOCL_ALL)
|
||||
endif()
|
||||
target_link_libraries(benchmark_aocl Eigen3::Eigen)
|
||||
if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
|
||||
target_link_libraries(benchmark_aocl ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
|
||||
endif()
|
||||
endif()
|
||||
#----------------------------------------------------------------------------------------#
|
||||
|
||||
if (EIGEN_BUILD_DEMOS)
|
||||
add_subdirectory(demos EXCLUDE_FROM_ALL)
|
||||
@@ -792,6 +857,9 @@ if(PROJECT_IS_TOP_LEVEL)
|
||||
if (EIGEN_BUILD_LAPACK)
|
||||
message(STATUS "lapack | Build LAPACK subset library (not the same thing as Eigen)")
|
||||
endif()
|
||||
if(EIGEN_BUILD_AOCL_BENCH)
|
||||
message(STATUS "benchmark_aocl | Build AOCL benchmark executable")
|
||||
endif()
|
||||
message(STATUS "------------+--------------------------------------------------------------")
|
||||
message(STATUS "")
|
||||
endif()
|
||||
@@ -799,3 +867,4 @@ endif()
|
||||
message(STATUS "")
|
||||
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
|
||||
message(STATUS "")
|
||||
|
||||
|
||||
@@ -53,6 +53,8 @@
// this include file manages BLAS and MKL related macros
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
#include "src/Core/util/AOCL_Support.h"

#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16

@@ -463,6 +465,10 @@ using std::ptrdiff_t;
#include "src/Core/Assign_MKL.h"
#endif

#ifdef EIGEN_USE_AOCL_VML
#include "src/Core/Assign_AOCL.h"
#endif

#include "src/Core/GlobalFunctions.h"
// IWYU pragma: end_exports

Eigen/src/Core/Assign_AOCL.h (new file, 301 lines)
@@ -0,0 +1,301 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Assign_AOCL.h - AOCL Vectorized Math Dispatch Layer for Eigen
|
||||
*
|
||||
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Description:
|
||||
* ------------
|
||||
* This file implements a high-performance dispatch layer that automatically
|
||||
* routes Eigen's element-wise mathematical operations to AMD Optimizing CPU
|
||||
* Libraries (AOCL) Vector Math Library (VML) functions when beneficial for
|
||||
* performance.
|
||||
*
|
||||
* The dispatch system uses C++ template specialization to intercept Eigen's
|
||||
* assignment operations and redirect them to AOCL's VRDA functions, which
|
||||
* provide optimized implementations for AMD Zen architectures.
|
||||
*
|
||||
* Key Features:
|
||||
* -------------
|
||||
* 1. Automatic Dispatch: Seamlessly routes supported operations to AOCL without
|
||||
* requiring code changes in user applications
|
||||
*
|
||||
* 2. Performance Optimization: Uses AOCL VRDA functions optimized for Zen
|
||||
* family processors with automatic SIMD instruction selection (AVX2, AVX-512)
|
||||
*
|
||||
* 3. Threshold-Based Activation: Only activates for vectors larger than
|
||||
* EIGEN_AOCL_VML_THRESHOLD (default: 128 elements) to avoid overhead on
|
||||
* small vectors
|
||||
*
|
||||
* 4. Precision-Specific Handling:
|
||||
* - Double precision: AOCL VRDA vectorized functions
|
||||
* - Single precision: Scalar fallback (preserves correctness)
|
||||
*
|
||||
* 5. Memory Layout Compatibility: Ensures direct memory access and compatible
|
||||
* storage orders between source and destination for optimal performance
|
||||
*
|
||||
* Supported Operations:
|
||||
* ---------------------
|
||||
* UNARY OPERATIONS (vector → vector):
|
||||
* - Transcendental: exp(), sin(), cos(), sqrt(), log(), log10(), log2()
|
||||
*
|
||||
* BINARY OPERATIONS (vector op vector → vector):
|
||||
* - Arithmetic: +, *, pow()
|
||||
*
|
||||
* Template Specialization Mechanism:
|
||||
* -----------------------------------
|
||||
* The system works by specializing Eigen's Assignment template for:
|
||||
* 1. CwiseUnaryOp with scalar_*_op functors (unary operations)
|
||||
* 2. CwiseBinaryOp with scalar_*_op functors (binary operations)
|
||||
* 3. Dense2Dense assignment context with AOCL-compatible traits
|
||||
*
|
||||
* Dispatch conditions (all must be true):
|
||||
* - Source and destination have DirectAccessBit (contiguous memory)
|
||||
* - Compatible storage orders (both row-major or both column-major)
|
||||
* - Vector size ≥ EIGEN_AOCL_VML_THRESHOLD or Dynamic size
|
||||
* - Supported data type (currently double precision for VRDA)
|
||||
*
|
||||
* Integration Example:
|
||||
* --------------------
|
||||
* // Standard Eigen code - no changes required
|
||||
* VectorXd x = VectorXd::Random(10000);
|
||||
* VectorXd y = VectorXd::Random(10000);
|
||||
* VectorXd result;
|
||||
*
|
||||
* // These operations are automatically dispatched to AOCL:
|
||||
* result = x.array().exp(); // → amd_vrda_exp()
|
||||
* result = x.array().sin(); // → amd_vrda_sin()
|
||||
* result = x.array() + y.array(); // → amd_vrda_add()
|
||||
* result = x.array().pow(y.array()); // → amd_vrda_pow()
|
||||
*
|
||||
* Configuration:
|
||||
* --------------
|
||||
* Required preprocessor definitions:
|
||||
* - EIGEN_USE_AOCL_ALL or EIGEN_USE_AOCL_MT: Enable AOCL integration
|
||||
* - EIGEN_USE_AOCL_VML: Enable Vector Math Library dispatch
|
||||
*
|
||||
* Compilation Requirements:
|
||||
* -------------------------
|
||||
* Include paths:
|
||||
* - AOCL headers: -I${AOCL_ROOT}/include
|
||||
* - Eigen headers: -I/path/to/eigen
|
||||
*
|
||||
* Link libraries:
|
||||
* - AOCL MathLib: -lamdlibm
|
||||
* - Standard math: -lm
|
||||
*
|
||||
* Compiler flags:
|
||||
* - Optimization: -O3 (required for inlining)
|
||||
* - Architecture: -march=znver5 or -march=native
|
||||
* - Vectorization: -mfma -mavx512f (if supported)
|
||||
*
|
||||
* Platform Support:
|
||||
* ------------------
|
||||
* - Primary: Linux x86_64 with AMD Zen family processors
|
||||
* - Compilers: GCC 8+, Clang 10+, AOCC (recommended)
|
||||
* - AOCL Version: 4.0+ (with VRDA support)
|
||||
*
|
||||
* Error Handling:
|
||||
* ---------------
|
||||
* - Graceful fallback to scalar operations for unsupported configurations
|
||||
* - Compile-time detection of AOCL availability
|
||||
* - Runtime size and alignment validation with eigen_assert()
|
||||
*
|
||||
* Developer:
|
||||
* ----------
|
||||
* Name: Sharad Saurabh Bhaskar
|
||||
* Email: shbhaska@amd.com
|
||||
* Organization: Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef EIGEN_ASSIGN_AOCL_H
|
||||
#define EIGEN_ASSIGN_AOCL_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Traits for unary operations.
|
||||
template <typename Dst, typename Src> class aocl_assign_traits {
|
||||
private:
|
||||
enum {
|
||||
DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
|
||||
SrcHasDirectAccess = !!(Src::Flags & DirectAccessBit),
|
||||
StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
|
||||
InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
|
||||
: (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
|
||||
: int(Dst::RowsAtCompileTime),
|
||||
LargeEnough =
|
||||
(InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
|
||||
};
|
||||
|
||||
public:
|
||||
enum {
|
||||
EnableAoclVML = DstHasDirectAccess && SrcHasDirectAccess &&
|
||||
StorageOrdersAgree && LargeEnough,
|
||||
Traversal = LinearTraversal
|
||||
};
|
||||
};
|
||||
|
||||
// Traits for binary operations (e.g., add, pow).
|
||||
template <typename Dst, typename Lhs, typename Rhs>
|
||||
class aocl_assign_binary_traits {
|
||||
private:
|
||||
enum {
|
||||
DstHasDirectAccess = !!(Dst::Flags & DirectAccessBit),
|
||||
LhsHasDirectAccess = !!(Lhs::Flags & DirectAccessBit),
|
||||
RhsHasDirectAccess = !!(Rhs::Flags & DirectAccessBit),
|
||||
StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Lhs::IsRowMajor)) &&
|
||||
(int(Dst::IsRowMajor) == int(Rhs::IsRowMajor)),
|
||||
InnerSize = Dst::IsVectorAtCompileTime ? int(Dst::SizeAtCompileTime)
|
||||
: (Dst::Flags & RowMajorBit) ? int(Dst::ColsAtCompileTime)
|
||||
: int(Dst::RowsAtCompileTime),
|
||||
LargeEnough =
|
||||
(InnerSize == Dynamic) || (InnerSize >= EIGEN_AOCL_VML_THRESHOLD)
|
||||
};
|
||||
|
||||
public:
|
||||
enum {
|
||||
EnableAoclVML = DstHasDirectAccess && LhsHasDirectAccess &&
|
||||
RhsHasDirectAccess && StorageOrdersAgree && LargeEnough
|
||||
};
|
||||
};
|
||||
|
||||
// Unary operation dispatch for float (scalar fallback).
|
||||
#define EIGEN_AOCL_VML_UNARY_CALL_FLOAT(EIGENOP) \
|
||||
template <typename DstXprType, typename SrcXprNested> \
|
||||
struct Assignment< \
|
||||
DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested>, \
|
||||
assign_op<float, float>, Dense2Dense, \
|
||||
std::enable_if_t< \
|
||||
aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
|
||||
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<float>, SrcXprNested> \
|
||||
SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, \
|
||||
const assign_op<float, float> &) { \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
Eigen::Index n = dst.size(); \
|
||||
if (n <= 0) \
|
||||
return; \
|
||||
const float *input = \
|
||||
reinterpret_cast<const float *>(src.nestedExpression().data()); \
|
||||
float *output = reinterpret_cast<float *>(dst.data()); \
|
||||
for (Eigen::Index i = 0; i < n; ++i) { \
|
||||
output[i] = std::EIGENOP(input[i]); \
|
||||
} \
|
||||
} \
|
||||
};
|
||||
|
||||
// Unary operation dispatch for double (AOCL vectorized).
|
||||
#define EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
|
||||
template <typename DstXprType, typename SrcXprNested> \
|
||||
struct Assignment< \
|
||||
DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested>, \
|
||||
assign_op<double, double>, Dense2Dense, \
|
||||
std::enable_if_t< \
|
||||
aocl_assign_traits<DstXprType, SrcXprNested>::EnableAoclVML>> { \
|
||||
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<double>, SrcXprNested> \
|
||||
SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, \
|
||||
const assign_op<double, double> &) { \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
Eigen::Index n = dst.size(); \
|
||||
eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
|
||||
if (n <= 0) \
|
||||
return; \
|
||||
const double *input = \
|
||||
reinterpret_cast<const double *>(src.nestedExpression().data()); \
|
||||
double *output = reinterpret_cast<double *>(dst.data()); \
|
||||
int aocl_n = internal::convert_index<int>(n); \
|
||||
AOCLOP(aocl_n, const_cast<double *>(input), output); \
|
||||
} \
|
||||
};
|
||||
|
||||
// Instantiate unary calls for float (scalar).
|
||||
// EIGEN_AOCL_VML_UNARY_CALL_FLOAT(exp)
|
||||
|
||||
// Instantiate unary calls for double (AOCL vectorized).
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp2, amd_vrda_exp2)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(exp, amd_vrda_exp)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sin, amd_vrda_sin)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cos, amd_vrda_cos)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(sqrt, amd_vrda_sqrt)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(cbrt, amd_vrda_cbrt)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(abs, amd_vrda_fabs)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log, amd_vrda_log)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log10, amd_vrda_log10)
|
||||
EIGEN_AOCL_VML_UNARY_CALL_DOUBLE(log2, amd_vrda_log2)
|
||||
|
||||
// Binary operation dispatch for float (scalar fallback).
|
||||
#define EIGEN_AOCL_VML_BINARY_CALL_FLOAT(EIGENOP, STDFUNC) \
|
||||
template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
|
||||
struct Assignment< \
|
||||
DstXprType, \
|
||||
CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
|
||||
RhsXprNested>, \
|
||||
assign_op<float, float>, Dense2Dense, \
|
||||
std::enable_if_t<aocl_assign_binary_traits< \
|
||||
DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
|
||||
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<float, float>, LhsXprNested, \
|
||||
RhsXprNested> \
|
||||
SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, \
|
||||
const assign_op<float, float> &) { \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
Eigen::Index n = dst.size(); \
|
||||
if (n <= 0) \
|
||||
return; \
|
||||
const float *lhs = reinterpret_cast<const float *>(src.lhs().data()); \
|
||||
const float *rhs = reinterpret_cast<const float *>(src.rhs().data()); \
|
||||
float *output = reinterpret_cast<float *>(dst.data()); \
|
||||
for (Eigen::Index i = 0; i < n; ++i) { \
|
||||
output[i] = STDFUNC(lhs[i], rhs[i]); \
|
||||
} \
|
||||
} \
|
||||
};
|
||||
|
||||
// Binary operation dispatch for double (AOCL vectorized).
|
||||
#define EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(EIGENOP, AOCLOP) \
|
||||
template <typename DstXprType, typename LhsXprNested, typename RhsXprNested> \
|
||||
struct Assignment< \
|
||||
DstXprType, \
|
||||
CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
|
||||
RhsXprNested>, \
|
||||
assign_op<double, double>, Dense2Dense, \
|
||||
std::enable_if_t<aocl_assign_binary_traits< \
|
||||
DstXprType, LhsXprNested, RhsXprNested>::EnableAoclVML>> { \
|
||||
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<double, double>, LhsXprNested, \
|
||||
RhsXprNested> \
|
||||
SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, \
|
||||
const assign_op<double, double> &) { \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
Eigen::Index n = dst.size(); \
|
||||
eigen_assert(n <= INT_MAX && "AOCL does not support arrays larger than INT_MAX"); \
|
||||
if (n <= 0) \
|
||||
return; \
|
||||
const double *lhs = reinterpret_cast<const double *>(src.lhs().data()); \
|
||||
const double *rhs = reinterpret_cast<const double *>(src.rhs().data()); \
|
||||
double *output = reinterpret_cast<double *>(dst.data()); \
|
||||
int aocl_n = internal::convert_index<int>(n); \
|
||||
AOCLOP(aocl_n, const_cast<double *>(lhs), const_cast<double *>(rhs), output); \
|
||||
} \
|
||||
};
|
||||
|
||||
// Instantiate binary calls for float (scalar).
|
||||
// EIGEN_AOCL_VML_BINARY_CALL_FLOAT(sum, std::plus<float>) // Using
|
||||
// scalar_sum_op for addition EIGEN_AOCL_VML_BINARY_CALL_FLOAT(pow, std::pow)
|
||||
|
||||
// Instantiate binary calls for double (AOCL vectorized).
|
||||
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(sum, amd_vrda_add) // Using scalar_sum_op for addition
|
||||
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(pow, amd_vrda_pow)
|
||||
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(max, amd_vrda_fmax)
|
||||
EIGEN_AOCL_VML_BINARY_CALL_DOUBLE(min, amd_vrda_fmin)
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_ASSIGN_AOCL_H
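
A minimal sketch of how the dispatch layer above is exercised from user code, assuming AOCL is installed and the translation unit is compiled with `-DEIGEN_USE_AOCL_ALL` (or `-DEIGEN_USE_AOCL_MT`) and linked against `-lamdlibm -lm`; without those definitions the same code simply runs through Eigen's built-in paths.

```cpp
#include <iostream>
#include <Eigen/Core>

int main() {
  const Eigen::Index n = 1 << 20;  // well above EIGEN_AOCL_VML_THRESHOLD (128)
  Eigen::VectorXd x = Eigen::VectorXd::LinSpaced(n, 0.1, 10.0);
  Eigen::VectorXd y = Eigen::VectorXd::LinSpaced(n, 0.5, 5.0);
  Eigen::VectorXd result(n);

  result = x.array().exp();        // candidate for the amd_vrda_exp path
  result = x.array().log();        // candidate for the amd_vrda_log path
  result = x.array() + y.array();  // candidate for the amd_vrda_add path

  std::cout << "checksum: " << result.sum() << std::endl;
  return 0;
}
```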
|
||||
Eigen/src/Core/util/AOCL_Support.h (new file, 175 lines)
@@ -0,0 +1,175 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* AOCL_Support.h - AMD Optimizing CPU Libraries Integration Header for Eigen
|
||||
*
|
||||
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Description:
|
||||
* ------------
|
||||
* This header file serves as the central configuration and integration point
|
||||
* for AMD Optimizing CPU Libraries (AOCL) with the Eigen C++ template library.
|
||||
* It orchestrates the integration of multiple AOCL components to provide
|
||||
* optimal mathematical computing performance on AMD Zen family processors.
|
||||
*
|
||||
* AOCL Component Integration:
|
||||
* ---------------------------
|
||||
* 1. AOCL Vector Math Library (VML):
|
||||
* - Provides VRDA (Vector Rapid Double-precision Arithmetic) functions
|
||||
* - Optimized transcendental functions: exp, sin, cos, sqrt, log, pow, etc.
|
||||
* - SIMD vectorization for AMD architectures (AVX2, AVX-512)
|
||||
* - Headers: amdlibm.h, amdlibm_vec.h
|
||||
*
|
||||
* 2. AOCL BLAS (BLIS - BLAS-like Library Instantiation Software):
|
||||
* - High-performance Basic Linear Algebra Subprograms
|
||||
* - Supports single-threaded (libblis) and multithreaded (libblis-mt)
|
||||
* variants
|
||||
* - Optimized matrix operations: GEMM, GEMV, TRSM, etc.
|
||||
* - Headers: cblas.h, blis.h
|
||||
*
|
||||
* 3. AOCL LAPACK (libFLAME - Formal Linear Algebra Methods Environment):
|
||||
* - Dense linear algebra operations: factorizations, eigenvalue solvers
|
||||
* - Matrix decompositions: LU, Cholesky, QR, SVD
|
||||
* - Eigenvalue/eigenvector computations optimized for AMD hardware
|
||||
* - Headers: LAPACKE interface
|
||||
*
|
||||
* Configuration Constants:
* -------------------------
|
||||
* EIGEN_AOCL_VML_THRESHOLD (default: 128):
|
||||
* - Minimum vector size for AOCL VML dispatch
|
||||
* - Smaller vectors use standard Eigen to avoid function call overhead
|
||||
* - Optimal values: 64-512 depending on operation and data characteristics
|
||||
*
|
||||
*
|
||||
*
|
||||
* Architecture Support:
|
||||
* ---------------------
|
||||
* Optimized for AMD processor families:
|
||||
* - Zen Architecture (Naples, Rome): AVX2 optimization
|
||||
* - Zen 2 Architecture (Rome, Matisse): Enhanced AVX2
|
||||
* - Zen 3 Architecture (Milan, Vermeer): Improved IPC and cache
|
||||
* - Zen 4 Architecture (Genoa, Raphael): AVX-512 support
|
||||
* - Zen 5 Architecture (Turin, Granite Ridge): Enhanced AVX-512
|
||||
*
|
||||
*
|
||||
* Dependencies:
|
||||
* -------------
|
||||
* Required AOCL components:
|
||||
* - libamdlibm: Core math library with VRDA functions
|
||||
* - libblis or libblis-mt: BLAS implementation
|
||||
* - libflame: LAPACK implementation
|
||||
*
|
||||
* System requirements:
|
||||
* - AMD x86_64 processor (optimal performance)
|
||||
* - Linux, Windows, or compatible POSIX system
|
||||
* - C++11 or later standard
|
||||
* - CMake 3.5+ for build system integration
|
||||
*
|
||||
* Developer:
|
||||
* ----------
|
||||
* Name: Sharad Saurabh Bhaskar
|
||||
* Email: shbhaska@amd.com
|
||||
* Organization: Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_AOCL_SUPPORT_H
|
||||
#define EIGEN_AOCL_SUPPORT_H
|
||||
|
||||
#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
|
||||
|
||||
#include <complex>
|
||||
|
||||
// Define AOCL component flags based on main flags
|
||||
#ifdef EIGEN_USE_AOCL_ALL
|
||||
#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
|
||||
#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
|
||||
|
||||
// Enable Eigen BLAS backend only if BLIS provides compatible interface
|
||||
#if defined(EIGEN_AOCL_BLIS_COMPATIBLE)
|
||||
#define EIGEN_USE_BLAS // Enable Eigen BLAS backend
|
||||
#endif
|
||||
|
||||
#define EIGEN_USE_LAPACKE // Enable LAPACK backend (FLAME)
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_AOCL_MT
|
||||
#define EIGEN_USE_AOCL_VML // Enable AOCL Vector Math Library
|
||||
#define EIGEN_USE_AOCL_BLAS // Enable AOCL BLAS (BLIS)
|
||||
|
||||
// For multithreaded: disable EIGEN_USE_BLAS to avoid signature conflicts
|
||||
// Use direct BLIS calls instead through EIGEN_USE_AOCL_BLAS
|
||||
// #define EIGEN_USE_BLAS // Commented out - causes conflicts with BLIS
|
||||
// interface
|
||||
|
||||
// Note: EIGEN_USE_LAPACKE is kept enabled in MT mode here; disable it manually
// if the LAPACKE interface conflicts with the BLIS-provided headers on your system.
#define EIGEN_USE_LAPACKE
#define EIGEN_AOCL_USE_BLIS_MT 1 // Enable multithreaded BLIS
|
||||
#endif
|
||||
|
||||
// Handle standalone EIGEN_USE_AOCL_VML flag
|
||||
#ifndef EIGEN_USE_AOCL_VML
|
||||
#ifdef EIGEN_USE_AOCL_ALL
|
||||
#define EIGEN_USE_AOCL_VML
|
||||
#endif
|
||||
#ifdef EIGEN_USE_AOCL_MT
|
||||
#define EIGEN_USE_AOCL_VML
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Configuration constants - define these for any AOCL usage
|
||||
#ifndef EIGEN_AOCL_VML_THRESHOLD
|
||||
#define EIGEN_AOCL_VML_THRESHOLD 128 // Threshold for VML dispatch
|
||||
#endif
|
||||
|
||||
#ifndef AOCL_SIMD_WIDTH
|
||||
#define AOCL_SIMD_WIDTH 8 // AVX-512: 512 bits / 64 bits per double
|
||||
#endif
|
||||
|
||||
// Include AOCL Math Library headers for VML
|
||||
#if defined(EIGEN_USE_AOCL_VML) || defined(EIGEN_USE_AOCL_ALL) || \
|
||||
defined(EIGEN_USE_AOCL_MT)
|
||||
#if defined(__has_include)
|
||||
#if __has_include("amdlibm.h")
|
||||
#include "amdlibm.h"
|
||||
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
|
||||
#define AMD_LIBM_VEC_EXPERIMENTAL
|
||||
#endif
|
||||
#if __has_include("amdlibm_vec.h")
|
||||
#include "amdlibm_vec.h"
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
// Fallback for compilers without __has_include
|
||||
#include "amdlibm.h"
|
||||
#ifndef AMD_LIBM_VEC_EXPERIMENTAL
|
||||
#define AMD_LIBM_VEC_EXPERIMENTAL
|
||||
#endif
|
||||
#include "amdlibm_vec.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Include CBLAS headers when BLAS is enabled
|
||||
#ifdef EIGEN_USE_AOCL_BLAS
|
||||
#if defined(__has_include)
|
||||
#if __has_include("cblas.h")
|
||||
#include "cblas.h"
|
||||
#elif __has_include("blis.h")
|
||||
#include "blis.h"
|
||||
#endif
|
||||
#else
|
||||
// Fallback
|
||||
#include "cblas.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
// AOCL-specific type definitions
|
||||
typedef std::complex<double> dcomplex;
|
||||
typedef std::complex<float> scomplex;
|
||||
typedef int BlasIndex; // Standard BLAS index type
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_USE_AOCL_ALL || EIGEN_USE_AOCL_MT
|
||||
|
||||
#endif // EIGEN_AOCL_SUPPORT_H
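
As a hedged illustration (not part of the patch), the mode switches and the threshold above are ordinary preprocessor macros, so they can be set per translation unit before any Eigen header is included. Note that, per `aocl_assign_traits` in Assign_AOCL.h, dynamic-size expressions always qualify for dispatch; the threshold is compared against fixed compile-time sizes only.

```cpp
// Sketch: select single-threaded AOCL mode and raise the dispatch threshold.
// Macro names come from AOCL_Support.h; the values here are illustrative.
#define EIGEN_USE_AOCL_ALL
#define EIGEN_AOCL_VML_THRESHOLD 512  // fixed sizes below 512 stay on the built-in path

#include <Eigen/Core>
#include <iostream>

int main() {
  // Fixed-size vector below the threshold: handled by Eigen's generic assignment.
  Eigen::Matrix<double, 64, 1> small_v = Eigen::Matrix<double, 64, 1>::Constant(0.5);
  Eigen::Matrix<double, 64, 1> small_out;
  small_out = small_v.array().sin();

  // Dynamic-size vector: always a candidate for the AOCL VML path.
  Eigen::VectorXd large_v = Eigen::VectorXd::LinSpaced(100000, 0.1, 1.0);
  Eigen::VectorXd large_out(large_v.size());
  large_out = large_v.array().sin();

  std::cout << small_out.sum() + large_out.sum() << std::endl;
  return 0;
}
```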
|
||||
@@ -53,3 +53,56 @@ $ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp
|
||||
|
||||
|
||||
|
||||
************************
|
||||
* benchmark_aocl *
|
||||
************************
|
||||
|
||||
This benchmark exercises Eigen operations using the AMD Optimizing CPU
Libraries (AOCL). It is disabled by default and can be enabled when
configuring the build:
|
||||
|
||||
cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON
|
||||
|
||||
The resulting `benchmark_aocl` target is compiled with `-O3` and, if the
|
||||
compiler supports it, `-march=znver5` for optimal performance on AMD
|
||||
processors.
|
||||
|
||||
The benchmark also links to `libblis-mt.so` and `libflame.so` so BLAS and
|
||||
LAPACK operations run with multithreaded AOCL when available.
|
||||
|
||||
By default the CMake build defines `EIGEN_USE_AOCL_MT` via the option
|
||||
`EIGEN_AOCL_BENCH_USE_MT` (enabled). Set this option to `OFF` if you want
|
||||
to build the benchmark using the single-threaded AOCL libraries instead,
|
||||
in which case `EIGEN_USE_AOCL_ALL` is defined.
|
||||
|
||||
|
||||
|
||||
Alternatively you can build the same benchmark using the
|
||||
`Makefile` in this directory. This allows experimenting with
|
||||
different compiler flags without reconfiguring CMake:
|
||||
|
||||
```
cd bench && make                             # builds with -O3 -march=znver5 by default
make clean && make CXX="clang++"             # use a different compiler instead of g++
make clean && make MARCH="" CXXFLAGS="-O2"   # example of custom flags
make AOCL_ROOT=/opt/aocl                     # use AOCL from a custom location
```

This Makefile links against `libblis-mt.so` and `libflame.so` so the
|
||||
matrix multiplication benchmark exercises multithreaded BLIS when
|
||||
`EIGEN_USE_AOCL_MT` is defined (enabled by default in the Makefile).
|
||||
|
||||
If you prefer to compile manually, ensure that the Eigen include path
|
||||
points to the directory where `AOCL_Support.h` resides. For example:
|
||||
|
||||
|
||||
```
clang++ -O3 -std=c++14 -I../build/install/include \
    -march=znver5 -DEIGEN_USE_AOCL_MT \
    benchmark_aocl.cpp -o benchmark_aocl \
    -lblis-mt -lflame -lamdlibm -lpthread -lm
```

Replace `../build/install/include` with your actual Eigen install path.
|
||||
|
||||
When invoking `make`, you can point `AOCL_ROOT` to your AOCL
|
||||
installation directory so the Makefile links against `$(AOCL_ROOT)/lib`.
|
||||
|
||||
|
||||
|
||||
bench/benchmark_aocl.cpp (new file, 362 lines)
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen
|
||||
*
|
||||
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Description:
|
||||
* ------------
|
||||
* This benchmark suite evaluates the performance of Eigen mathematical
|
||||
* operations when integrated with AMD Optimizing CPU Libraries (AOCL). It
|
||||
* tests:
|
||||
*
|
||||
* 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt,
|
||||
* log, etc.) using AOCL Vector Math Library (VML) for optimized
|
||||
* double-precision operations
|
||||
*
|
||||
* 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library
|
||||
* with support for both single-threaded and multithreaded execution
|
||||
*
|
||||
* 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using
|
||||
* libflame
|
||||
*
|
||||
* 4. Real-world Scenarios: Financial risk computation simulating covariance
|
||||
* matrix calculations and eigenvalue analysis for portfolio optimization
|
||||
*
|
||||
* The benchmark automatically detects AOCL configuration and adjusts test
|
||||
* execution accordingly, providing performance comparisons between standard
|
||||
* Eigen operations and AOCL-accelerated implementations.
|
||||
*
|
||||
* Compilation:
|
||||
* ------------
|
||||
* # Using AOCC compiler (recommended for best AOCL compatibility):
|
||||
* clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE>
|
||||
* -I${AOCL_ROOT}/include \
|
||||
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
||||
* -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \
|
||||
* -o build/eigen_aocl_benchmark
|
||||
*
|
||||
* # Alternative: Using GCC with proper library paths:
|
||||
* g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE>
|
||||
* -I${AOCL_ROOT}/include \
|
||||
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
||||
* -lamdlibm -lm -lblis -lflame -lpthread -lrt \
|
||||
* -o build/eigen_aocl_benchmark
|
||||
*
|
||||
* # For multithreaded BLIS support:
|
||||
* clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \
|
||||
* -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \
|
||||
* -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \
|
||||
* -o build/eigen_aocl_benchmark_mt
|
||||
*
|
||||
* Usage:
|
||||
* ------
|
||||
* export AOCL_ROOT=/path/to/aocl/installation
|
||||
* export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH
|
||||
* ./build/eigen_aocl_benchmark
|
||||
*
|
||||
* Developer:
|
||||
* ----------
|
||||
* Name: Sharad Saurabh Bhaskar
|
||||
* Email: shbhaska@amd.com
|
||||
* Organization: Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// Simple - just include Eigen headers
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/Dense>
|
||||
#include <Eigen/Eigenvalues>
|
||||
|
||||
// Only include CBLAS directly when an AOCL BLIS configuration is enabled
#if defined(EIGEN_USE_AOCL_ALL) || defined(EIGEN_USE_AOCL_MT)
#include <cblas.h>
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace std::chrono;
|
||||
using namespace Eigen;
|
||||
|
||||
void benchmarkVectorMath(int size) {
|
||||
VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0);
|
||||
VectorXd result(size);
|
||||
double elapsed_ms = 0;
|
||||
|
||||
cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl;
|
||||
|
||||
auto start = high_resolution_clock::now();
|
||||
result = v.array().exp();
|
||||
auto end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "exp() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sin();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sin() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cos();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cos() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sqrt();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sqrt() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cbrt();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cbrt() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().abs();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "abs() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().log();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "log() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().log10();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "log10() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().exp2();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "exp2() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().asin();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "asin() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().sinh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "sinh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().acos();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "acos() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().cosh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "cosh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().tan();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "tan() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().atan();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "atan() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().tanh();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "tanh() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
VectorXd v2 = VectorXd::Random(size);
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array() + v2.array();
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "add() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().pow(2.0);
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "pow() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().max(v2.array());
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "max() time: " << elapsed_ms << " ms" << endl;
|
||||
|
||||
start = high_resolution_clock::now();
|
||||
result = v.array().min(v2.array());
|
||||
end = high_resolution_clock::now();
|
||||
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "min() time: " << elapsed_ms << " ms" << endl;
|
||||
}
|
||||
|
||||
// Function to benchmark BLAS operation: Matrix multiplication.
|
||||
void benchmarkMatrixMultiplication(int matSize) {
|
||||
cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize
|
||||
<< ") ---" << endl;
|
||||
|
||||
MatrixXd A = MatrixXd::Random(matSize, matSize);
|
||||
MatrixXd B = MatrixXd::Random(matSize, matSize);
|
||||
MatrixXd C(matSize, matSize);
|
||||
|
||||
auto start = high_resolution_clock::now();
|
||||
C = A * B;
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl;
|
||||
}
|
||||
|
||||
// Benchmark BLIS directly using its CBLAS interface if available.
|
||||
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
|
||||
#if defined(EIGEN_AOCL_USE_BLIS_MT)
|
||||
cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
|
||||
<< ", threads=" << numThreads << ") ---" << endl;
|
||||
vector<double> A(matSize * matSize);
|
||||
vector<double> B(matSize * matSize);
|
||||
vector<double> C(matSize * matSize);
|
||||
for (auto &v : A)
|
||||
v = static_cast<double>(rand()) / RAND_MAX;
|
||||
for (auto &v : B)
|
||||
v = static_cast<double>(rand()) / RAND_MAX;
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
string th = to_string(numThreads);
|
||||
setenv("BLIS_NUM_THREADS", th.c_str(), 1);
|
||||
auto start = high_resolution_clock::now();
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
|
||||
matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
|
||||
C.data(), matSize);
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl;
|
||||
#else
|
||||
(void)matSize;
|
||||
(void)numThreads;
|
||||
cout << "\nBLIS multithreaded support not enabled." << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Function to benchmark LAPACK operation: Eigenvalue decomposition.
|
||||
void benchmarkEigenDecomposition(int matSize) {
|
||||
cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize
|
||||
<< " x " << matSize << ") ---" << endl;
|
||||
MatrixXd M = MatrixXd::Random(matSize, matSize);
|
||||
// Make matrix symmetric (necessary for eigenvalue decomposition of
|
||||
// self-adjoint matrices)
|
||||
M = (M + M.transpose()) * 0.5;
|
||||
|
||||
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
||||
auto start = high_resolution_clock::now();
|
||||
eigensolver.compute(M);
|
||||
auto end = high_resolution_clock::now();
|
||||
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
||||
if (eigensolver.info() == Success) {
|
||||
cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl;
|
||||
} else {
|
||||
cout << "Eigenvalue decomposition failed." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Function simulating a real-world FSI risk computation scenario.
|
||||
// Example: Compute covariance matrix from simulated asset returns, then perform
|
||||
// eigenvalue decomposition.
|
||||
void benchmarkFSIRiskComputation(int numPeriods, int numAssets) {
|
||||
cout << "\n--- FSI Risk Computation Benchmark ---" << endl;
|
||||
cout << "Simulating " << numPeriods << " periods for " << numAssets
|
||||
<< " assets." << endl;
|
||||
|
||||
// Simulate asset returns: each column represents an asset's returns.
|
||||
MatrixXd returns = MatrixXd::Random(numPeriods, numAssets);
|
||||
|
||||
// Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1)
|
||||
auto start = high_resolution_clock::now();
|
||||
MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1);
|
||||
auto end = high_resolution_clock::now();
|
||||
double cov_time = duration_cast<milliseconds>(end - start).count();
|
||||
cout << "Covariance matrix computation time: " << cov_time << " ms" << endl;
|
||||
|
||||
// Eigenvalue decomposition on covariance matrix.
|
||||
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
||||
start = high_resolution_clock::now();
|
||||
eigensolver.compute(cov);
|
||||
end = high_resolution_clock::now();
|
||||
double eig_time = duration_cast<milliseconds>(end - start).count();
|
||||
if (eigensolver.info() == Success) {
|
||||
cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms"
|
||||
<< endl;
|
||||
cout << "Top 3 Eigenvalues: "
|
||||
<< eigensolver.eigenvalues().tail(3).transpose() << endl;
|
||||
} else {
|
||||
cout << "Eigenvalue decomposition failed." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl;
|
||||
cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl;
|
||||
cout << "Organization: Advanced Micro Devices, Inc." << endl;
|
||||
cout << "License: Mozilla Public License 2.0" << endl << endl;
|
||||
|
||||
// Print AOCL configuration
|
||||
#ifdef EIGEN_USE_AOCL_MT
|
||||
cout << "AOCL Mode: MULTITHREADED (MT)" << endl;
|
||||
cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl;
|
||||
#elif defined(EIGEN_USE_AOCL_ALL)
|
||||
cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl;
|
||||
cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl;
|
||||
#else
|
||||
cout << "AOCL Mode: DISABLED" << endl;
|
||||
cout << "Using standard Eigen implementation" << endl;
|
||||
#endif
|
||||
cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl;
|
||||
|
||||
// Benchmark vector math functions with varying vector sizes.
|
||||
vector<int> vectorSizes = {5000000, 10000000, 50000000};
|
||||
for (int size : vectorSizes) {
|
||||
benchmarkVectorMath(size);
|
||||
}
|
||||
|
||||
// Benchmark matrix multiplication for varying sizes.
|
||||
vector<int> matrixSizes = {1024};
|
||||
for (int msize : matrixSizes) {
|
||||
benchmarkMatrixMultiplication(msize);
|
||||
#if defined(EIGEN_AOCL_USE_BLIS_MT)
|
||||
benchmarkBlisMultithreaded(msize, thread::hardware_concurrency());
|
||||
#endif
|
||||
}
|
||||
|
||||
// Benchmark LAPACK: Eigenvalue Decomposition.
|
||||
for (int msize : matrixSizes) {
|
||||
benchmarkEigenDecomposition(msize);
|
||||
}
|
||||
|
||||
// Benchmark a complex FSI risk computation scenario.
|
||||
// For example, simulate 10,000 time periods (days) for 500 assets.
|
||||
benchmarkFSIRiskComputation(10000, 500);
|
||||
|
||||
cout << "\n=== Benchmark Complete ===" << endl;
|
||||
return 0;
|
||||
}
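
The benchmark repeats the same start/stop/`duration_cast` pattern for every operation; a small helper such as the hypothetical `time_ms` below (a sketch, not part of the patch) captures that pattern and keeps each measurement to one line:

```cpp
#include <chrono>
#include <iostream>
#include <Eigen/Core>

// Hypothetical helper: run `op` once and return the elapsed wall time in milliseconds.
template <typename Op>
double time_ms(Op&& op) {
  auto start = std::chrono::high_resolution_clock::now();
  op();
  auto end = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count();
}

int main() {
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5000000, 0.1, 10.0);
  Eigen::VectorXd result(v.size());
  std::cout << "exp() time: " << time_ms([&] { result = v.array().exp(); }) << " ms\n";
  std::cout << "sin() time: " << time_ms([&] { result = v.array().sin(); }) << " ms\n";
  return 0;
}
```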
|
||||
cmake/FindAOCL.cmake (new file, 264 lines)
@@ -0,0 +1,264 @@
|
||||
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# FindAOCL.cmake - CMake Module for AMD Optimizing CPU Libraries (AOCL)
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Description:
|
||||
# ------------
|
||||
# This CMake module locates and configures AMD Optimizing CPU Libraries (AOCL)
|
||||
# for high-performance mathematical computing on AMD processors. It searches for
|
||||
# and sets up the following AOCL components:
|
||||
#
|
||||
# 1. AOCL MathLib (libamdlibm): Vector Math Library providing optimized
|
||||
# transcendental functions (exp, sin, cos, sqrt, log, etc.) with VRDA
|
||||
# (Vector Rapid Double-precision Arithmetic) support for SIMD acceleration
|
||||
#
|
||||
# 2. AOCL BLAS (BLIS): Basic Linear Algebra Subprograms optimized for AMD
|
||||
# architectures, supporting both single-threaded (libblis) and multithreaded
|
||||
# (libblis-mt) execution with OpenMP parallelization
|
||||
#
|
||||
# 3. AOCL LAPACK (libflame): Linear Algebra PACKage providing dense matrix
|
||||
# factorizations, eigenvalue/eigenvector computations, and linear system
|
||||
# solvers optimized for AMD processors
|
||||
#
|
||||
# The module automatically detects the appropriate library variants based on
|
||||
# configuration flags and provides proper linking setup for optimal performance
|
||||
# on Zen, Zen2, Zen3, Zen4, and Zen5 architectures.
|
||||
#
|
||||
# Variables Set:
|
||||
# --------------
|
||||
# AOCL_FOUND - True if AOCL libraries are found
|
||||
# AOCL_LIBRARIES - List of AOCL libraries to link against
|
||||
# AOCL_INCLUDE_DIRS - Include directories for AOCL headers
|
||||
# AOCL_BLAS_TYPE - Type of BLIS library found ("multithreaded" or "single-threaded")
|
||||
# AOCL_CORE_LIB - Path to core AOCL math library
|
||||
# AOCL_BLAS_LIB - Path to AOCL BLAS library
|
||||
# AOCL_LAPACK_LIB - Path to AOCL LAPACK library
|
||||
#
|
||||
# Configuration Options:
|
||||
# ----------------------
|
||||
# EIGEN_AOCL_BENCH_USE_MT - When ON, searches for multithreaded BLIS first
|
||||
# When OFF, searches for single-threaded BLIS only
|
||||
#
|
||||
# # For multithreaded BLIS:
|
||||
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=ON
|
||||
#
|
||||
# # For single-threaded BLIS:
|
||||
# cmake .. -DEIGEN_AOCL_BENCH_USE_MT=OFF
|
||||
#
|
||||
# Library Search Paths:
|
||||
# ---------------------
|
||||
# The module searches for AOCL libraries in the following order:
|
||||
# 1. ${AOCL_ROOT}/lib (or ${AOCL_ROOT}/lib32 for 32-bit)
|
||||
# 2. /opt/amd/aocl/lib64 (or /opt/amd/aocl/lib32 for 32-bit)
|
||||
# 3. ${LIB_INSTALL_DIR}
|
||||
#
|
||||
# Expected Library Names:
|
||||
# -----------------------
|
||||
# Core MathLib: amdlibm, alm, almfast
|
||||
# BLAS Single: blis
|
||||
# BLAS Multi: blis-mt
|
||||
# LAPACK: flame
|
||||
#
|
||||
# Dependencies:
|
||||
# -------------
|
||||
# The module automatically links the following system libraries:
|
||||
# - libm (standard math library)
|
||||
# - libpthread (POSIX threads)
|
||||
# - librt (real-time extensions)
|
||||
#
|
||||
# Architecture Support:
|
||||
# ---------------------
|
||||
# Optimized for AMD Zen family processors (Zen, Zen2, Zen3, Zen4, Zen5)
|
||||
# with automatic architecture detection and SIMD instruction selection.
|
||||
#
|
||||
# Developer:
|
||||
# ----------
|
||||
# Name: Sharad Saurabh Bhaskar
|
||||
# Email: shbhaska@amd.com
|
||||
#
|
||||
|
||||
if(NOT DEFINED AOCL_ROOT)
|
||||
if(DEFINED ENV{AOCL_ROOT})
|
||||
set(AOCL_ROOT $ENV{AOCL_ROOT})
|
||||
message(STATUS "AOCL_ROOT set from environment: ${AOCL_ROOT}")
|
||||
else()
|
||||
message(WARNING "AOCL_ROOT is not set. AOCL support will be disabled.")
|
||||
set(AOCL_LIBRARIES "")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(AOCL_LIBRARIES)
|
||||
set(AOCL_FIND_QUIETLY TRUE)
|
||||
endif()
|
||||
|
||||
# Determine default include directories
|
||||
set(AOCL_INCLUDE_DIRS "")
|
||||
if(AOCL_ROOT AND EXISTS "${AOCL_ROOT}/include")
|
||||
list(APPEND AOCL_INCLUDE_DIRS "${AOCL_ROOT}/include")
|
||||
endif()
|
||||
if(EXISTS "/opt/amd/aocl/include")
|
||||
list(APPEND AOCL_INCLUDE_DIRS "/opt/amd/aocl/include")
|
||||
endif()
|
||||
|
||||
if(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
|
||||
# Search for the core AOCL math library.
|
||||
find_library(AOCL_CORE_LIB
|
||||
NAMES amdlibm alm almfast
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_CORE_LIB)
|
||||
message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
|
||||
# Conditional BLIS library search based on MT requirement
|
||||
if(EIGEN_AOCL_BENCH_USE_MT)
|
||||
# Search for multithreaded BLIS first
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis-mt
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
|
||||
set(AOCL_BLAS_TYPE "multithreaded")
|
||||
else()
|
||||
message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
endif()
|
||||
else()
|
||||
# Search for single-threaded BLIS
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
else()
|
||||
message(WARNING "AOCL single-threaded BLAS library not found.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Now search for AOCL LAPACK library.
|
||||
find_library(AOCL_LAPACK_LIB
|
||||
NAMES flame
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib64
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_LAPACK_LIB)
|
||||
message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
|
||||
else()
|
||||
# For 32-bit systems, similar search paths.
|
||||
find_library(AOCL_CORE_LIB
|
||||
NAMES amdlibm alm almfast
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_CORE_LIB)
|
||||
message(STATUS "Found AOCL core library: ${AOCL_CORE_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL core library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
|
||||
# Conditional BLIS library search for 32-bit
|
||||
if(EIGEN_AOCL_BENCH_USE_MT)
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis-mt
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
message(STATUS "Found AOCL BLAS (MT) library: ${AOCL_BLAS_LIB}")
|
||||
set(AOCL_BLAS_TYPE "multithreaded")
|
||||
else()
|
||||
message(WARNING "AOCL multithreaded BLAS library not found, falling back to single-threaded.")
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
endif()
|
||||
else()
|
||||
find_library(AOCL_BLAS_LIB
|
||||
NAMES blis
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_BLAS_LIB)
|
||||
message(STATUS "Found AOCL BLAS (ST) library: ${AOCL_BLAS_LIB}")
|
||||
set(AOCL_BLAS_TYPE "single-threaded")
|
||||
else()
|
||||
message(WARNING "AOCL single-threaded BLAS library not found.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_library(AOCL_LAPACK_LIB
|
||||
NAMES flame
|
||||
PATHS
|
||||
${AOCL_ROOT}/lib
|
||||
/opt/amd/aocl/lib32
|
||||
${LIB_INSTALL_DIR}
|
||||
)
|
||||
if(AOCL_LAPACK_LIB)
|
||||
message(STATUS "Found AOCL LAPACK library: ${AOCL_LAPACK_LIB}")
|
||||
else()
|
||||
message(WARNING "AOCL LAPACK library not found in ${AOCL_ROOT}/lib or default locations.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Combine the found libraries into one variable.
|
||||
if(AOCL_CORE_LIB)
|
||||
set(AOCL_LIBRARIES ${AOCL_CORE_LIB})
|
||||
endif()
|
||||
if(AOCL_BLAS_LIB)
|
||||
list(APPEND AOCL_LIBRARIES ${AOCL_BLAS_LIB})
|
||||
endif()
|
||||
if(AOCL_LAPACK_LIB)
|
||||
list(APPEND AOCL_LIBRARIES ${AOCL_LAPACK_LIB})
|
||||
endif()
|
||||
if(AOCL_LIBRARIES)
|
||||
# Link against the standard math and pthread libraries as well as librt
|
||||
list(APPEND AOCL_LIBRARIES m pthread rt)
|
||||
endif()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(AOCL DEFAULT_MSG AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
|
||||
mark_as_advanced(AOCL_LIBRARIES AOCL_INCLUDE_DIRS)
|
||||
doc/UsingAOCL.dox (new file, 289 lines)
@@ -0,0 +1,289 @@
|
||||
/*
|
||||
Copyright (c) 2025, AMD Inc. All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of AMD nor the names of its contributors may
|
||||
be used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************************
|
||||
* Content : Documentation on the use of AMD AOCL through Eigen
|
||||
********************************************************************************
|
||||
*/
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \page TopicUsingAOCL Using AMD® AOCL from %Eigen
|
||||
|
||||
Since %Eigen version 3.4, users can benefit from built-in AMD® Optimizing CPU Libraries (AOCL) optimizations with an installed copy of AOCL 5.0 (or later).
|
||||
|
||||
<a href="https://www.amd.com/en/developer/aocl.html"> AMD AOCL </a> provides highly optimized, multi-threaded mathematical routines for x86-64 processors with a focus on AMD "Zen"-based architectures. AOCL is available on Linux and Windows for x86-64 architectures.
|
||||
|
||||
\note
|
||||
AMD® AOCL is freely available software, but it is the responsibility of users to download, install, and ensure their product's license allows linking to the AOCL libraries. AOCL is distributed under a permissive license that allows commercial use.
|
||||
|
||||
Using AMD AOCL through %Eigen is straightforward:
|
||||
-# export \c AOCL_ROOT into your environment
|
||||
-# define one of the AOCL macros before including any %Eigen headers (see table below)
|
||||
-# link your program to AOCL libraries (BLIS, FLAME, LibM)
|
||||
-# ensure your system supports the target architecture optimizations
|
||||
|
||||
When doing so, a number of %Eigen's algorithms are silently substituted with calls to AMD AOCL routines.
|
||||
These substitutions apply only for \b Dynamic \b or \b large \b enough objects with one of the following standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
|
||||
Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
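
For instance, here is a minimal sketch of which expressions are eligible (the macro choice and the size 512 are arbitrary illustrations, not required values):

\code
#define EIGEN_USE_AOCL_ALL
#include <Eigen/Dense>

// Dynamic-size double matrices are candidates for AOCL substitution.
Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
Eigen::MatrixXd C = A * B;               // may be dispatched to AOCL-BLIS (dgemm)

// Small fixed-size objects keep using Eigen's built-in kernels.
Eigen::Matrix4f D = Eigen::Matrix4f::Random();
Eigen::Matrix4f E = D * D;               // built-in Eigen code path
\endcode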

The AOCL integration targets three core components:
- **BLIS**: High-performance BLAS implementation optimized for modern cache hierarchies
- **FLAME**: Dense linear algebra algorithms providing LAPACK functionality
- **LibM**: Optimized standard math routines with vectorized implementations

\section TopicUsingAOCL_Macros Configuration Macros

You can choose which parts will be substituted by defining one or more of the following macros:

<table class="manual">
<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (AOCL-BLIS)</td></tr>
<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external LAPACK routines via the LAPACKE C interface (AOCL-FLAME)</td></tr>
<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE, but algorithms of lower numerical robustness are disabled. \n This currently concerns only JacobiSVD, which would be replaced by \c gesvd.</td></tr>
<tr class="alt"><td>\c EIGEN_USE_AOCL_VML </td><td>Enables the use of AOCL LibM vector math operations for coefficient-wise functions</td></tr>
<tr><td>\c EIGEN_USE_AOCL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_AOCL_VML</td></tr>
<tr class="alt"><td>\c EIGEN_USE_AOCL_MT </td><td>Equivalent to \c EIGEN_USE_AOCL_ALL, but ensures that the multi-threaded BLIS (\c libblis-mt) is used. \n \b Recommended for most applications.</td></tr>
</table>

\note The AOCL integration automatically enables these optimizations when the matrix/vector size exceeds \c EIGEN_AOCL_VML_THRESHOLD (default: 128 elements). For smaller operations, Eigen's built-in vectorization may be faster due to function-call overhead.
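
If your workload is dominated by mid-sized arrays, the crossover point can be tuned. The sketch below assumes that \c EIGEN_AOCL_VML_THRESHOLD is a user-overridable macro, i.e. that defining it before including any %Eigen header replaces the default of 128:

\code
// Assumption: EIGEN_AOCL_VML_THRESHOLD may be overridden before including Eigen.
#define EIGEN_USE_AOCL_VML
#define EIGEN_AOCL_VML_THRESHOLD 256   // dispatch to AOCL LibM only for >= 256 elements
#include <Eigen/Dense>

Eigen::VectorXd v = Eigen::VectorXd::Random(1024);
Eigen::VectorXd w = v.array().exp();   // above the threshold: candidate for amd_vrda_exp
\endcode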

\section TopicUsingAOCL_Performance Performance Considerations

The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE macros can be combined with AOCL-specific optimizations:

- **Multi-threading**: Use \c EIGEN_USE_AOCL_MT to automatically select the multi-threaded BLIS library
- **Architecture targeting**: AOCL libraries are optimized for AMD Zen architectures (Zen, Zen2, Zen3, Zen4, Zen5)
- **Vector Math Library**: AOCL LibM provides vectorized implementations that can operate on entire arrays simultaneously
- **Memory layout**: Eigen's column-major storage directly matches AOCL's expected data layout for zero-copy operation

\section TopicUsingAOCL_Types Supported Data Types and Sizes

AOCL acceleration is applied to:
- **Scalar types**: \c float, \c double, \c complex<float>, \c complex<double>
- **Matrix/Vector sizes**: Dynamic size or compile-time size ≥ \c EIGEN_AOCL_VML_THRESHOLD
- **Storage order**: Both column-major (default) and row-major layouts
- **Memory alignment**: Eigen's data pointers are directly compatible with AOCL function signatures

The current AOCL Vector Math Library integration is specialized for \c double precision, with automatic fallback to scalar implementations for \c float.
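
As an illustration (vector names and sizes are arbitrary), only the \c double expression below is a candidate for the LibM vector path:

\code
#define EIGEN_USE_AOCL_VML
#include <Eigen/Dense>

Eigen::VectorXd xd = Eigen::VectorXd::Random(4096);
Eigen::VectorXd yd = xd.array().log();   // double: candidate for amd_vrda_log

Eigen::VectorXf xf = Eigen::VectorXf::Random(4096);
Eigen::VectorXf yf = xf.array().log();   // float: falls back to Eigen's built-in implementation
\endcode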

\section TopicUsingAOCL_Functions Vector Math Functions

The following table summarizes the coefficient-wise operations accelerated by \c EIGEN_USE_AOCL_VML:

<table class="manual">
<tr><th>Code example</th><th>AOCL routines</th></tr>
<tr><td>\code
v2 = v1.array().exp();
v2 = v1.array().sin();
v2 = v1.array().cos();
v2 = v1.array().tan();
v2 = v1.array().log();
v2 = v1.array().log10();
v2 = v1.array().log2();
v2 = v1.array().sqrt();
v2 = v1.array().pow(1.5);
v2 = v1.array() + v2.array();
\endcode</td><td>\code
amd_vrda_exp
amd_vrda_sin
amd_vrda_cos
amd_vrda_tan
amd_vrda_log
amd_vrda_log10
amd_vrda_log2
amd_vrda_sqrt
amd_vrda_pow
amd_vrda_add
\endcode</td></tr>
</table>

In these examples, \c v1 and \c v2 are dense vectors of type \c VectorXd with size ≥ \c EIGEN_AOCL_VML_THRESHOLD.

\section TopicUsingAOCL_Example Complete Example

\code
#define EIGEN_USE_AOCL_MT
#include <iostream>
#include <Eigen/Dense>

int main() {
  const int n = 2048;

  // Large matrices automatically use AOCL-BLIS for multiplication
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
  Eigen::MatrixXd C = A * B; // Dispatched to dgemm

  // Large vectors automatically use AOCL LibM for math functions
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10000, 0, 10);
  Eigen::VectorXd result = v.array().sin(); // Dispatched to amd_vrda_sin

  // LAPACK decompositions use AOCL-FLAME; LLT requires a symmetric positive-definite matrix
  Eigen::MatrixXd S = A * A.transpose() + n * Eigen::MatrixXd::Identity(n, n);
  Eigen::LLT<Eigen::MatrixXd> llt(S); // Dispatched to dpotrf

  std::cout << "Matrix norm: " << C.norm() << std::endl;
  std::cout << "Vector result norm: " << result.norm() << std::endl;

  return 0;
}
\endcode

\section TopicUsingAOCL_Building Building and Linking

To compile with AOCL support, set the \c AOCL_ROOT environment variable and link against the required libraries:

\code
export AOCL_ROOT=/path/to/aocl
clang++ -O3 -g -DEIGEN_USE_AOCL_ALL \
    -I./install/include -I${AOCL_ROOT}/include \
    -Wno-parentheses my_app.cpp \
    -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis \
    -lpthread -lrt -lm -lomp \
    -o eigen_aocl_example
\endcode

For multi-threaded performance, link against the multi-threaded BLIS library instead:
\code
clang++ -O3 -g -DEIGEN_USE_AOCL_MT \
    -I./install/include -I${AOCL_ROOT}/include \
    -Wno-parentheses my_app.cpp \
    -L${AOCL_ROOT}/lib -lamdlibm -lflame -lblis-mt \
    -lpthread -lrt -lm -lomp \
    -o eigen_aocl_example
\endcode

Key compiler and linker flags:
- \c -DEIGEN_USE_AOCL_ALL: Enable all AOCL accelerations (BLAS, LAPACK, VML)
- \c -DEIGEN_USE_AOCL_MT: Enable the multi-threaded version (links \c -lblis-mt)
- \c -lblis: Single-threaded BLIS library
- \c -lblis-mt: Multi-threaded BLIS library (recommended for performance)
- \c -lflame: FLAME LAPACK implementation
- \c -lamdlibm: AMD LibM vector math library
- \c -lomp: OpenMP runtime for multi-threading support
- \c -lpthread -lrt: System threading and real-time libraries
- \c -Wno-parentheses: Suppress common warnings when using AOCL headers

\subsection TopicUsingAOCL_EigenBuild Building Eigen with AOCL Support

To build and install Eigen with AOCL support, use the following CMake configuration:

\code
cmake .. -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_C_COMPILER=clang \
    -DCMAKE_CXX_COMPILER=clang++ \
    -DCMAKE_INSTALL_PREFIX=$PWD/install \
    -DINCLUDE_INSTALL_DIR=$PWD/install/include \
    && make install -j$(nproc)
\endcode

To build Eigen with AOCL integration and benchmarking capabilities, a configuration such as the following can be used (see the parameter table below for the recommended values):

\code
cmake .. -DEIGEN_BUILD_AOCL_BENCH=ON \
    -DEIGEN_AOCL_BENCH_FLAGS="-O3 -mavx512f -fveclib=AMDLIBM" \
    -DEIGEN_AOCL_BENCH_USE_MT=OFF \
    -DEIGEN_AOCL_BENCH_ARCH=znver5 \
    -DCMAKE_BUILD_TYPE=Debug \
    -DCMAKE_C_COMPILER=clang \
    -DCMAKE_CXX_COMPILER=clang++ \
    -DCMAKE_INSTALL_PREFIX=$PWD/install \
    -DINCLUDE_INSTALL_DIR=$PWD/install/include \
    && make install -j$(nproc)
\endcode

**CMake Configuration Parameters:**

<table class="manual">
<tr><th>Parameter</th><th>Expected Values</th><th>Description</th></tr>
<tr><td>\c EIGEN_BUILD_AOCL_BENCH</td><td>\c ON, \c OFF</td><td>Enable/disable AOCL benchmark compilation</td></tr>
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_FLAGS</td><td>Compiler flags string</td><td>Additional compiler optimizations, e.g. \c "-O3 -mavx512f -fveclib=AMDLIBM"</td></tr>
<tr><td>\c EIGEN_AOCL_BENCH_USE_MT</td><td>\c ON, \c OFF</td><td>Use multi-threaded AOCL libraries (\c ON recommended for performance)</td></tr>
<tr class="alt"><td>\c EIGEN_AOCL_BENCH_ARCH</td><td>\c znver3, \c znver4, \c znver5, \c native, \c generic</td><td>Target AMD architecture (match your CPU generation)</td></tr>
<tr><td>\c CMAKE_BUILD_TYPE</td><td>\c Release, \c Debug, \c RelWithDebInfo</td><td>Build configuration (\c Release recommended for benchmarks)</td></tr>
<tr class="alt"><td>\c CMAKE_C_COMPILER</td><td>\c clang, \c gcc</td><td>C compiler (clang recommended for AOCL)</td></tr>
<tr><td>\c CMAKE_CXX_COMPILER</td><td>\c clang++, \c g++</td><td>C++ compiler (clang++ recommended for AOCL)</td></tr>
<tr class="alt"><td>\c CMAKE_INSTALL_PREFIX</td><td>Installation path</td><td>Where to install the Eigen headers</td></tr>
<tr><td>\c INCLUDE_INSTALL_DIR</td><td>Header path</td><td>Specific path for the Eigen headers</td></tr>
</table>

**Architecture Selection Guide:**
- \c znver3: AMD Zen 3 (EPYC 7003, Ryzen 5000 series)
- \c znver4: AMD Zen 4 (EPYC 9004, Ryzen 7000 series)
- \c znver5: AMD Zen 5 (EPYC 9005, Ryzen 9000 series)
- \c native: Auto-detect the current CPU architecture
- \c generic: Generic x86-64 without architecture-specific optimizations

**Custom Compiler Flags Explanation:**
- \c -O3: Maximum optimization level
- \c -mavx512f: Enable the AVX-512 foundation instruction set (if supported)
- \c -fveclib=AMDLIBM: Use AMD LibM for vectorized math functions

\subsection TopicUsingAOCL_Benchmark Building the AOCL Benchmark

After configuring Eigen, build the AOCL benchmark executable:

\code
cmake --build . --target benchmark_aocl -j$(nproc)
\endcode

This creates the \c benchmark_aocl executable, which demonstrates AOCL acceleration for various matrix sizes and operations.

**Running the Benchmark:**
\code
./benchmark_aocl
\endcode

The benchmark automatically compares:
- Eigen's native performance vs. AOCL-accelerated operations
- Matrix multiplication performance (BLIS vs. Eigen)
- Vector math function performance (LibM vs. Eigen)
- Memory bandwidth utilization and cache efficiency

\section TopicUsingAOCL_CMake CMake Integration

When using CMake, you can rely on a \c FindAOCL module. If your module defines imported targets, usage looks like:

\code
find_package(AOCL REQUIRED)
target_compile_definitions(my_target PRIVATE EIGEN_USE_AOCL_MT)
target_link_libraries(my_target PRIVATE AOCL::BLIS_MT AOCL::FLAME AOCL::LIBM)
\endcode

Note that the \c FindAOCL module shipped with Eigen's build system reports plain \c AOCL_LIBRARIES and \c AOCL_INCLUDE_DIRS variables, which can be passed directly to \c target_link_libraries and \c target_include_directories if no imported targets are available.

\section TopicUsingAOCL_Troubleshooting Troubleshooting

Common issues and solutions:

- **Link errors**: Ensure \c AOCL_ROOT is set and the AOCL libraries are in \c LD_LIBRARY_PATH
- **Performance not improved**: Verify that you are using matrices/vectors larger than the threshold
- **Thread contention**: Set \c OMP_NUM_THREADS to match your CPU core count
- **Architecture mismatch**: Use the appropriate \c -march flag for your AMD processor
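
As a quick sanity check that the configuration macros actually reached the compiler, a minimal sketch such as the following can be built with the same flags as your application (it only inspects the macros listed in the table above):

\code
#include <iostream>
#include <Eigen/Dense>

int main() {
#ifdef EIGEN_USE_BLAS
  std::cout << "EIGEN_USE_BLAS is active (AOCL-BLIS backend)\n";
#endif
#ifdef EIGEN_USE_LAPACKE
  std::cout << "EIGEN_USE_LAPACKE is active (AOCL-FLAME backend)\n";
#endif
#ifdef EIGEN_USE_AOCL_VML
  std::cout << "EIGEN_USE_AOCL_VML is active (AOCL LibM backend)\n";
#endif
  return 0;
}
\endcode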

\section TopicUsingAOCL_Links Links

- AMD AOCL can be downloaded for free <a href="https://www.amd.com/en/developer/aocl.html">here</a>
- The AOCL User Guide and documentation are available on the AMD Developer Portal
- AOCL is also available through package managers and containerized environments

*/

}