eigen/bench/benchmark_aocl.cpp

/*
 * benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen
 *
 * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Description:
 * ------------
 * This benchmark suite evaluates the performance of Eigen mathematical
 * operations when integrated with AMD Optimizing CPU Libraries (AOCL). It
 * tests:
 *
 * 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt,
 * log, etc.) using AOCL Vector Math Library (VML) for optimized
 * double-precision operations
 *
 * 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library
 *    with support for both single-threaded and multithreaded execution
 *
 * 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using
 * libflame
 *
 * 4. Real-world Scenarios: Financial risk computation simulating covariance
 * matrix calculations and eigenvalue analysis for portfolio optimization
 *
 * The benchmark automatically detects AOCL configuration and adjusts test
 * execution accordingly, providing performance comparisons between standard
 * Eigen operations and AOCL-accelerated implementations.
 *
 * Compilation:
 * ------------
 * # Using AOCC compiler (recommended for best AOCL compatibility):
 * clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE>
 * -I${AOCL_ROOT}/include \
 *         -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
 *         -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \
 *         -o build/eigen_aocl_benchmark
 *
 * # Alternative: Using GCC with proper library paths:
 * g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE>
 * -I${AOCL_ROOT}/include \
 *     -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
 *     -lamdlibm -lm -lblis -lflame -lpthread -lrt \
 *     -o build/eigen_aocl_benchmark
 *
 * # For multithreaded BLIS support:
 * clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \
 *         -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \
 *         -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \
 *         -o build/eigen_aocl_benchmark_mt
 *
 * Usage:
 * ------
 * export AOCL_ROOT=/path/to/aocl/installation
 * export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH
 * ./build/eigen_aocl_benchmark
 *
 * Developer:
 * ----------
 * Name: Sharad Saurabh Bhaskar
 * Email: shbhaska@amd.com
 * Organization: Advanced Micro Devices, Inc.
 */

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>

// Simple - just include Eigen headers
#include <Eigen/Core>
#include <Eigen/Dense>
#include <Eigen/Eigenvalues>

// Only include CBLAS if AOCL BLIS is available
#ifdef EIGEN_USE_AOCL_ALL
#include <cblas.h>
#endif

using namespace std;
using namespace std::chrono;
using namespace Eigen;

void benchmarkVectorMath(int size) {
  VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0);
  VectorXd result(size);
  double elapsed_ms = 0;

  cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl;

  auto start = high_resolution_clock::now();
  result = v.array().exp();
  auto end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "exp() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().sin();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "sin() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().cos();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "cos() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().sqrt();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "sqrt() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().cbrt();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "cbrt() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().abs();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "abs() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().log();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "log() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().log10();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "log10() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().exp2();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "exp2() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().asin();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "asin() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().sinh();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "sinh() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().acos();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "acos() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().cosh();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "cosh() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().tan();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "tan() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().atan();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "atan() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().tanh();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "tanh() time: " << elapsed_ms << " ms" << endl;

  VectorXd v2 = VectorXd::Random(size);
  start = high_resolution_clock::now();
  result = v.array() + v2.array();
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "add() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().pow(2.0);
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "pow() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().max(v2.array());
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "max() time: " << elapsed_ms << " ms" << endl;

  start = high_resolution_clock::now();
  result = v.array().min(v2.array());
  end = high_resolution_clock::now();
  elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "min() time: " << elapsed_ms << " ms" << endl;
}

// Function to benchmark BLAS operation: Matrix multiplication.
void benchmarkMatrixMultiplication(int matSize) {
  cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize
       << ") ---" << endl;

  MatrixXd A = MatrixXd::Random(matSize, matSize);
  MatrixXd B = MatrixXd::Random(matSize, matSize);
  MatrixXd C(matSize, matSize);

  auto start = high_resolution_clock::now();
  C = A * B;
  auto end = high_resolution_clock::now();
  double elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl;
}

// Benchmark BLIS directly using its CBLAS interface if available.
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
#if defined(EIGEN_AOCL_USE_BLIS_MT)
  cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
       << ", threads=" << numThreads << ") ---" << endl;
  vector<double> A(matSize * matSize);
  vector<double> B(matSize * matSize);
  vector<double> C(matSize * matSize);
  for (auto &v : A)
    v = static_cast<double>(rand()) / RAND_MAX;
  for (auto &v : B)
    v = static_cast<double>(rand()) / RAND_MAX;
  double alpha = 1.0, beta = 0.0;
  string th = to_string(numThreads);
  setenv("BLIS_NUM_THREADS", th.c_str(), 1);
  auto start = high_resolution_clock::now();
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
              matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
              C.data(), matSize);
  auto end = high_resolution_clock::now();
  double elapsed_ms = duration_cast<milliseconds>(end - start).count();
  cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl;
#else
  (void)matSize;
  (void)numThreads;
  cout << "\nBLIS multithreaded support not enabled." << endl;
#endif
}

// Function to benchmark LAPACK operation: Eigenvalue decomposition.
void benchmarkEigenDecomposition(int matSize) {
  cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize
       << " x " << matSize << ") ---" << endl;
  MatrixXd M = MatrixXd::Random(matSize, matSize);
  // Make matrix symmetric (necessary for eigenvalue decomposition of
  // self-adjoint matrices)
  M = (M + M.transpose()) * 0.5;

  SelfAdjointEigenSolver<MatrixXd> eigensolver;
  auto start = high_resolution_clock::now();
  eigensolver.compute(M);
  auto end = high_resolution_clock::now();
  double elapsed_ms = duration_cast<milliseconds>(end - start).count();
  if (eigensolver.info() == Success) {
    cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl;
  } else {
    cout << "Eigenvalue decomposition failed." << endl;
  }
}

// Function simulating a real-world FSI risk computation scenario.
// Example: Compute covariance matrix from simulated asset returns, then perform
// eigenvalue decomposition.
void benchmarkFSIRiskComputation(int numPeriods, int numAssets) {
  cout << "\n--- FSI Risk Computation Benchmark ---" << endl;
  cout << "Simulating " << numPeriods << " periods for " << numAssets
       << " assets." << endl;

  // Simulate asset returns: each column represents an asset's returns.
  MatrixXd returns = MatrixXd::Random(numPeriods, numAssets);

  // Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1)
  auto start = high_resolution_clock::now();
  MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1);
  auto end = high_resolution_clock::now();
  double cov_time = duration_cast<milliseconds>(end - start).count();
  cout << "Covariance matrix computation time: " << cov_time << " ms" << endl;

  // Eigenvalue decomposition on covariance matrix.
  SelfAdjointEigenSolver<MatrixXd> eigensolver;
  start = high_resolution_clock::now();
  eigensolver.compute(cov);
  end = high_resolution_clock::now();
  double eig_time = duration_cast<milliseconds>(end - start).count();
  if (eigensolver.info() == Success) {
    cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms"
         << endl;
    cout << "Top 3 Eigenvalues: "
         << eigensolver.eigenvalues().tail(3).transpose() << endl;
  } else {
    cout << "Eigenvalue decomposition failed." << endl;
  }
}

int main() {
  cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl;
  cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl;
  cout << "Organization: Advanced Micro Devices, Inc." << endl;
  cout << "License: Mozilla Public License 2.0" << endl << endl;

  // Print AOCL configuration
#ifdef EIGEN_USE_AOCL_MT
  cout << "AOCL Mode: MULTITHREADED (MT)" << endl;
  cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl;
#elif defined(EIGEN_USE_AOCL_ALL)
  cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl;
  cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl;
#else
  cout << "AOCL Mode: DISABLED" << endl;
  cout << "Using standard Eigen implementation" << endl;
#endif
  cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl;

  // Benchmark vector math functions with varying vector sizes.
  vector<int> vectorSizes = {5000000, 10000000, 50000000};
  for (int size : vectorSizes) {
    benchmarkVectorMath(size);
  }

  // Benchmark matrix multiplication for varying sizes.
  vector<int> matrixSizes = {1024};
  for (int msize : matrixSizes) {
    benchmarkMatrixMultiplication(msize);
#if defined(EIGEN_AOCL_USE_BLIS_MT)
    benchmarkBlisMultithreaded(msize, thread::hardware_concurrency());
#endif
  }

  // Benchmark LAPACK: Eigenvalue Decomposition.
  for (int msize : matrixSizes) {
    benchmarkEigenDecomposition(msize);
  }

  // Benchmark a complex FSI risk computation scenario.
  // For example, simulate 10,000 time periods (days) for 500 assets.
  benchmarkFSIRiskComputation(10000, 500);

  cout << "\n=== Benchmark Complete ===" << endl;
  return 0;
}