mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-01-18 17:31:19 +01:00
363 lines
13 KiB
C++
363 lines
13 KiB
C++
/*
|
|
* benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen
|
|
*
|
|
* Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* Description:
|
|
* ------------
|
|
* This benchmark suite evaluates the performance of Eigen mathematical
|
|
* operations when integrated with AMD Optimizing CPU Libraries (AOCL). It
|
|
* tests:
|
|
*
|
|
* 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt,
|
|
* log, etc.) using AOCL Vector Math Library (VML) for optimized
|
|
* double-precision operations
|
|
*
|
|
* 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library
|
|
* with support for both single-threaded and multithreaded execution
|
|
*
|
|
* 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using
|
|
* libflame
|
|
*
|
|
* 4. Real-world Scenarios: Financial risk computation simulating covariance
|
|
* matrix calculations and eigenvalue analysis for portfolio optimization
|
|
*
|
|
* The benchmark automatically detects AOCL configuration and adjusts test
|
|
* execution accordingly, providing performance comparisons between standard
|
|
* Eigen operations and AOCL-accelerated implementations.
|
|
*
|
|
* Compilation:
|
|
* ------------
|
|
* # Using AOCC compiler (recommended for best AOCL compatibility):
|
|
* clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
|
|
* -I${AOCL_ROOT}/include \
|
|
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
|
* -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \
|
|
* -o build/eigen_aocl_benchmark
|
|
*
|
|
* # Alternative: Using GCC with proper library paths:
|
|
* g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \
|
|
* -I${AOCL_ROOT}/include \
|
|
* -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \
|
|
* -lamdlibm -lm -lblis -lflame -lpthread -lrt \
|
|
* -o build/eigen_aocl_benchmark
|
|
*
|
|
* # For multithreaded BLIS support:
|
|
* clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \
|
|
* -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \
|
|
* -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \
|
|
* -o build/eigen_aocl_benchmark_mt
|
|
*
|
|
* Usage:
|
|
* ------
|
|
* export AOCL_ROOT=/path/to/aocl/installation
|
|
* export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH
|
|
* ./build/eigen_aocl_benchmark
|
|
*
|
|
* Developer:
|
|
* ----------
|
|
* Name: Sharad Saurabh Bhaskar
|
|
* Email: shbhaska@amd.com
|
|
* Organization: Advanced Micro Devices, Inc.
|
|
*/
|
|
|
|
#include <chrono>
|
|
#include <cstdlib>
|
|
#include <iostream>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
// Simple - just include Eigen headers
|
|
#include <Eigen/Core>
|
|
#include <Eigen/Dense>
|
|
#include <Eigen/Eigenvalues>
|
|
|
|
// Only include CBLAS if AOCL BLIS is available
|
|
#ifdef EIGEN_USE_AOCL_ALL
|
|
#include <cblas.h>
|
|
#endif
|
|
|
|
using namespace std;
|
|
using namespace std::chrono;
|
|
using namespace Eigen;
|
|
|
|
void benchmarkVectorMath(int size) {
|
|
VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0);
|
|
VectorXd result(size);
|
|
double elapsed_ms = 0;
|
|
|
|
cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl;
|
|
|
|
auto start = high_resolution_clock::now();
|
|
result = v.array().exp();
|
|
auto end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "exp() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().sin();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "sin() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().cos();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "cos() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().sqrt();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "sqrt() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().cbrt();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "cbrt() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().abs();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "abs() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().log();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "log() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().log10();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "log10() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().exp2();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "exp2() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().asin();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "asin() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().sinh();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "sinh() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().acos();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "acos() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().cosh();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "cosh() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().tan();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "tan() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().atan();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "atan() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().tanh();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "tanh() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
VectorXd v2 = VectorXd::Random(size);
|
|
start = high_resolution_clock::now();
|
|
result = v.array() + v2.array();
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "add() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().pow(2.0);
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "pow() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().max(v2.array());
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "max() time: " << elapsed_ms << " ms" << endl;
|
|
|
|
start = high_resolution_clock::now();
|
|
result = v.array().min(v2.array());
|
|
end = high_resolution_clock::now();
|
|
elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "min() time: " << elapsed_ms << " ms" << endl;
|
|
}
|
|
|
|
// Function to benchmark BLAS operation: Matrix multiplication.
|
|
void benchmarkMatrixMultiplication(int matSize) {
|
|
cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize
|
|
<< ") ---" << endl;
|
|
|
|
MatrixXd A = MatrixXd::Random(matSize, matSize);
|
|
MatrixXd B = MatrixXd::Random(matSize, matSize);
|
|
MatrixXd C(matSize, matSize);
|
|
|
|
auto start = high_resolution_clock::now();
|
|
C = A * B;
|
|
auto end = high_resolution_clock::now();
|
|
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl;
|
|
}
|
|
|
|
// Benchmarks DGEMM by calling cblas_dgemm directly when
// EIGEN_AOCL_USE_BLIS_MT is defined; otherwise only prints a notice.
//
// matSize    - number of rows and columns of the square operands. Values
//              <= 0 skip the benchmark.
// numThreads - value exported as BLIS_NUM_THREADS. Values <= 0 are replaced
//              by 1.
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
#if defined(EIGEN_AOCL_USE_BLIS_MT)
  if (matSize <= 0) {
    std::cout << "\nBLIS-mt DGEMM Benchmark skipped: matrix size must be "
                 "positive."
              << std::endl;
    return;
  }
  const int threadCount = numThreads > 0 ? numThreads : 1;

  std::cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
            << ", threads=" << threadCount << ") ---" << std::endl;

  // Compute the element count in size_t: matSize * matSize evaluated in int
  // overflows once matSize exceeds 46340.
  const std::size_t elementCount =
      static_cast<std::size_t>(matSize) * static_cast<std::size_t>(matSize);
  std::vector<double> A(elementCount);
  std::vector<double> B(elementCount);
  std::vector<double> C(elementCount);
  for (double &value : A) {
    value = static_cast<double>(std::rand()) / RAND_MAX;
  }
  for (double &value : B) {
    value = static_cast<double>(std::rand()) / RAND_MAX;
  }

  const double alpha = 1.0;
  const double beta = 0.0;

  // NOTE(review): BLIS may read BLIS_NUM_THREADS only when the library is
  // initialised; if an earlier BLAS call already initialised it, this setting
  // might have no effect -- confirm against the BLIS multithreading docs.
  const std::string threadText = std::to_string(threadCount);
  setenv("BLIS_NUM_THREADS", threadText.c_str(), 1);

  const auto begin = std::chrono::steady_clock::now();
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
              matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
              C.data(), matSize);
  const auto finish = std::chrono::steady_clock::now();

  // Fractional milliseconds; duration_cast<milliseconds> would truncate.
  const double elapsed_ms =
      std::chrono::duration<double, std::milli>(finish - begin).count();
  std::cout << "BLIS dgemm time: " << elapsed_ms << " ms" << std::endl;
#else
  (void)matSize;
  (void)numThreads;
  std::cout << "\nBLIS multithreaded support not enabled." << std::endl;
#endif
}
|
|
|
|
// Function to benchmark LAPACK operation: Eigenvalue decomposition.
|
|
void benchmarkEigenDecomposition(int matSize) {
|
|
cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize
|
|
<< " x " << matSize << ") ---" << endl;
|
|
MatrixXd M = MatrixXd::Random(matSize, matSize);
|
|
// Make matrix symmetric (necessary for eigenvalue decomposition of
|
|
// self-adjoint matrices)
|
|
M = (M + M.transpose()) * 0.5;
|
|
|
|
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
|
auto start = high_resolution_clock::now();
|
|
eigensolver.compute(M);
|
|
auto end = high_resolution_clock::now();
|
|
double elapsed_ms = duration_cast<milliseconds>(end - start).count();
|
|
if (eigensolver.info() == Success) {
|
|
cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl;
|
|
} else {
|
|
cout << "Eigenvalue decomposition failed." << endl;
|
|
}
|
|
}
|
|
|
|
// Function simulating a real-world FSI risk computation scenario.
|
|
// Example: Compute covariance matrix from simulated asset returns, then perform
|
|
// eigenvalue decomposition.
|
|
void benchmarkFSIRiskComputation(int numPeriods, int numAssets) {
|
|
cout << "\n--- FSI Risk Computation Benchmark ---" << endl;
|
|
cout << "Simulating " << numPeriods << " periods for " << numAssets
|
|
<< " assets." << endl;
|
|
|
|
// Simulate asset returns: each column represents an asset's returns.
|
|
MatrixXd returns = MatrixXd::Random(numPeriods, numAssets);
|
|
|
|
// Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1)
|
|
auto start = high_resolution_clock::now();
|
|
MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1);
|
|
auto end = high_resolution_clock::now();
|
|
double cov_time = duration_cast<milliseconds>(end - start).count();
|
|
cout << "Covariance matrix computation time: " << cov_time << " ms" << endl;
|
|
|
|
// Eigenvalue decomposition on covariance matrix.
|
|
SelfAdjointEigenSolver<MatrixXd> eigensolver;
|
|
start = high_resolution_clock::now();
|
|
eigensolver.compute(cov);
|
|
end = high_resolution_clock::now();
|
|
double eig_time = duration_cast<milliseconds>(end - start).count();
|
|
if (eigensolver.info() == Success) {
|
|
cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms"
|
|
<< endl;
|
|
cout << "Top 3 Eigenvalues: "
|
|
<< eigensolver.eigenvalues().tail(3).transpose() << endl;
|
|
} else {
|
|
cout << "Eigenvalue decomposition failed." << endl;
|
|
}
|
|
}
|
|
|
|
int main() {
|
|
cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl;
|
|
cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl;
|
|
cout << "Organization: Advanced Micro Devices, Inc." << endl;
|
|
cout << "License: Mozilla Public License 2.0" << endl << endl;
|
|
|
|
// Print AOCL configuration
|
|
#ifdef EIGEN_USE_AOCL_MT
|
|
cout << "AOCL Mode: MULTITHREADED (MT)" << endl;
|
|
cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl;
|
|
#elif defined(EIGEN_USE_AOCL_ALL)
|
|
cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl;
|
|
cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl;
|
|
#else
|
|
cout << "AOCL Mode: DISABLED" << endl;
|
|
cout << "Using standard Eigen implementation" << endl;
|
|
#endif
|
|
cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl;
|
|
|
|
// Benchmark vector math functions with varying vector sizes.
|
|
vector<int> vectorSizes = {5000000, 10000000, 50000000};
|
|
for (int size : vectorSizes) {
|
|
benchmarkVectorMath(size);
|
|
}
|
|
|
|
// Benchmark matrix multiplication for varying sizes.
|
|
vector<int> matrixSizes = {1024};
|
|
for (int msize : matrixSizes) {
|
|
benchmarkMatrixMultiplication(msize);
|
|
#if defined(EIGEN_AOCL_USE_BLIS_MT)
|
|
benchmarkBlisMultithreaded(msize, thread::hardware_concurrency());
|
|
#endif
|
|
}
|
|
|
|
// Benchmark LAPACK: Eigenvalue Decomposition.
|
|
for (int msize : matrixSizes) {
|
|
benchmarkEigenDecomposition(msize);
|
|
}
|
|
|
|
// Benchmark a complex FSI risk computation scenario.
|
|
// For example, simulate 10,000 time periods (days) for 500 assets.
|
|
benchmarkFSIRiskComputation(10000, 500);
|
|
|
|
cout << "\n=== Benchmark Complete ===" << endl;
|
|
return 0;
|
|
}
|